diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/args.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/args.json new file mode 100644 index 0000000000000000000000000000000000000000..99c4b7064894f541e8747d7da5ae79faee20027e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + 
"adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, 
+ "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + 
"push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + 
"boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, 
+ "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, 
task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', disable_tqdm=False, remove_unused_columns=False, 
label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, 
include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/README.md b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5978e4fb1fd6c90157286b85839ad77358f92787 --- 
/dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/adapter_config.json 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a24ef59a277a25429baa3b5582e16c9758a877af --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/adapter_model.safetensors b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..72092ea1c8a23c7425d675b3297400829738b652 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6925b044e24324bf914b67240405e919fe68314c48ad3de93e4a9d7611b9c04 +size 18516456 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/additional_config.json 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/args.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/args.json new file mode 100644 index 0000000000000000000000000000000000000000..99c4b7064894f541e8747d7da5ae79faee20027e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + 
"hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + 
"save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + 
"train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + 
"target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + 
"reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, 
use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, 
length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, 
train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..067eb8bf7999b333098987ee733880711efb3e55 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ad03aeea852fda7a99d2110bb343a2d7500127cd7c425b8190bf2da6eeec414 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..221d471b51860418e21af4c58f7e0822907a0d60 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5061412d162761e71d5f829e6c00bf103fb007b51f6501afafd6a97d8a29b695 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..db08bb052c3a7e223bf1b4ee4ffcafaa43c9617f --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c1dc264c8acdbef33234dc17ea8bc3dac7b0c47963c46a11315e9b208f6b10 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4b4bc3aee75281e3bcaf7af7661452d76110468 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e00dd19535a6983c94e09dcdaafb26f852c6cfd55ad55b3351d3029b6682bce +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..586ebc6f606531ef6b73484795523d4bf2d7e220 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5474595211fa1433f708b64299f338690014a59684143a97b8a483eada816d16 +size 13852592 diff --git 
a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..632567bfd3c059a085cf6404c8b233082e40c296 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7621c7455492cffefdd61a32a7844e9ac4e3d3fe00f158be226d8d4d45902e +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83e0b56c8f672eb6e6370e1fd99da06b70d9180e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd59ea28dbe51f9235104b5d25240be33127634846ff18dfb65322e6e520dd77 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d795c120779bd5c5fabff658d00e06b261034a3 --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b13a0c3ce6949efdfca603facc714ec81e90568d2804511c5589795c2d812620 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a80cde77a79cc6f427d18b8018fd5ed22e058adc --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8d0eaf863add0be1a69fe900dc7b4accb831099a887ac4c07e610342a09eb0d +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8aea8ddaa8ba461ac7f3cbbd12f3dbef8c0651e1 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4f53639a0af1f68261b80b065bf7d10ceb0077f395d9665523dcc3874edfa28 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c74553480eddac324df3dfdc856be4330d80053 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ac45e6b69a85f64aed28df5cd28b917cd5ded2a4bd2dea492b0a9308e19be22 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4755b661a69fe596116e21f3e2e7c131f5a00e9d --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e8c9747629233f5c2fc2ea351c6f8a8644066b137ffc58a33cd006e15c38a2d +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8fd8f8937c6b26b4a839599ef03678b4c7fb3fe --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:160343615ee350fe6385f4873ee8220599ca55478324ff501e35e1806a61d1c4 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d15d1d452f42ceec78f10b0f6931844050e60e6 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b6a726d6d266bd8be225df003e78a8423aca8ca33947417a0c92cac3af3fe48 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b2d58f4f519f307c638b56c24430ce29cd41e2b --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0542330bad8c72e3d4311ef7fa2d380919d9a87c4c830836520f1909e012e48 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab6ac58f8ce66a2694c8f306732a9735fec739fd --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f5b01d5fb1b01adb3d8d6fd7fd823a9cbde23f0abac255caefa79e4ea82ef13 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/latest b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..744ae7dbad571b6f37ec6c7066549494261bb59e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/latest @@ -0,0 +1 @@ +global_step100 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_0.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a74f25da28f01a2e6b66587824ee5f5cc9be737 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ee195ebde9bf012f945f068f133e7fe22fef5450c496607e3ef11cc2034a186 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_1.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f44ddc47315653477728c971b4ea191a3df8b92c --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0fe1a3315d60b197207c5cb249d0ce4f9ce6d7585e696276d9ffbcb5379893 +size 15984 diff --git 
a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_2.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..04636b9eca6484a4339eaa1e3acdf15d42d493b3 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c5bd6eae04542162b3e94245555bd81312524066bc01d0ebbfc4fd8554240e +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_3.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..05435e407541728c3159054a4beb6705039a8ddf --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b74942c68b00d657cfce186b0eeb4aa8f52efa04b114803b605fee8de45972 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_4.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..94fdf5f2c3e5df27424e6482bf52255531147a23 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd66dd2ba958fc9929441817d8154abbd929c0aa9cd66ff3171965bdaaf5d78 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_5.pth 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..da6e37fc011d97a1512e1e746bdd410a738c018a --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89eeedefdd62514d0130acc330a5c08e9774c95d38c60997905cfd65fc54b710 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_6.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..751fd85c617e15dee9713bc0f0c533af5bd18c8e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ced939100082608f57561a10e1888e69210c80675068db530c5815889910e +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_7.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4aacf54fa8285b7e199a7cd62f1ee3d8b9beb5e5 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d8d6ee244d99525e7004ae3f02d44ae63082d81fbbab7306f641ac6aeeb736f +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/scheduler.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/scheduler.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b2a1fb08c48e9d34df783eb19e7c9d1caf0ed386 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec37c3a15b8d061312402391f2fddb52d623a1416d6d2879a30f184450d844f +size 1064 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/trainer_state.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9522cc317fc06bfa6bf7d61d95cb8951cfb6729a --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.54272461, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100", + "epoch": 2.6315789473684212, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 12.157362496957303, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.6875, + "logits/rejected": 0.1416015625, + "logps/chosen": -768.0, + "logps/rejected": -284.0, + "loss": 1.384765625, + "memory(GiB)": 3.36, + "nll_loss": 0.69140625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.136299 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.539150660119484, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": 0.068756103515625, + "logits/rejected": -0.03271484375, + "logps/chosen": -809.5, + "logps/rejected": -460.5, + "loss": 2.017822265625, + "memory(GiB)": 15.74, + "nll_loss": 1.37890625, + 
"rewards/accuracies": 0.4375, + "rewards/chosen": 0.05777740478515625, + "rewards/margins": 0.1044921875, + "rewards/rejected": -0.04648590087890625, + "step": 5, + "train_speed(iter/s)": 0.30928 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.599031383319412, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.18076172471046448, + "logits/rejected": 0.168701171875, + "logps/chosen": -724.0, + "logps/rejected": -717.0, + "loss": 2.158203125, + "memory(GiB)": 36.36, + "nll_loss": 1.5546875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.964062511920929, + "rewards/margins": 0.3333984315395355, + "rewards/rejected": 0.6314452886581421, + "step": 10, + "train_speed(iter/s)": 0.364139 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.41127146668536, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": 0.3662109375, + "logits/rejected": 0.2630859315395355, + "logps/chosen": -729.5999755859375, + "logps/rejected": -533.5999755859375, + "loss": 1.2904296875, + "memory(GiB)": 36.36, + "nll_loss": 0.981249988079071, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.487499952316284, + "rewards/margins": 1.62890625, + "rewards/rejected": 0.8617187738418579, + "step": 15, + "train_speed(iter/s)": 0.391058 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 15.35539910021312, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": 0.12734374403953552, + "logits/rejected": 0.15620116889476776, + "logps/chosen": -484.3999938964844, + "logps/rejected": -604.7999877929688, + "loss": 1.4791015625, + "memory(GiB)": 36.36, + "nll_loss": 1.1945312023162842, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.234375, + "rewards/margins": 1.860937476158142, + "rewards/rejected": 2.3671875, + "step": 20, + "train_speed(iter/s)": 0.404763 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.59765625, + "eval_logits/rejected": 0.58203125, + "eval_logps/chosen": -346.0, + "eval_logps/rejected": 
-1200.0, + "eval_loss": 0.8779296875, + "eval_nll_loss": 0.66015625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.46875, + "eval_rewards/margins": 2.515625, + "eval_rewards/rejected": 4.9375, + "eval_runtime": 1.0501, + "eval_samples_per_second": 3.809, + "eval_steps_per_second": 0.952, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 9.750705999680513, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": 0.0028198242653161287, + "logits/rejected": 0.02851562574505806, + "logps/chosen": -708.7999877929688, + "logps/rejected": -666.0, + "loss": 1.101220703125, + "memory(GiB)": 36.36, + "nll_loss": 0.8804687261581421, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 6.09375, + "rewards/margins": 3.2359375953674316, + "rewards/rejected": 2.8531250953674316, + "step": 25, + "train_speed(iter/s)": 0.402214 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 1.1620640103930189, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -0.14511719346046448, + "logits/rejected": -0.010668945498764515, + "logps/chosen": -493.20001220703125, + "logps/rejected": -559.5999755859375, + "loss": 0.742919921875, + "memory(GiB)": 36.36, + "nll_loss": 0.717968761920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.668749809265137, + "rewards/margins": 4.993750095367432, + "rewards/rejected": 2.6703124046325684, + "step": 30, + "train_speed(iter/s)": 0.409863 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.5082652670453853, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -0.0003417968691792339, + "logits/rejected": -0.09726562350988388, + "logps/chosen": -783.2000122070312, + "logps/rejected": -649.2000122070312, + "loss": 0.8309814453125, + "memory(GiB)": 36.36, + "nll_loss": 0.778124988079071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.25, + "rewards/margins": 5.650000095367432, + "rewards/rejected": 2.6031250953674316, + "step": 35, + "train_speed(iter/s)": 0.415557 + 
}, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.1139559192769446, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": 0.05820312350988388, + "logits/rejected": -0.06640625, + "logps/chosen": -599.5999755859375, + "logps/rejected": -630.7999877929688, + "loss": 0.725, + "memory(GiB)": 36.36, + "nll_loss": 0.782031238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.856249809265137, + "rewards/margins": 7.199999809265137, + "rewards/rejected": 1.6640625, + "step": 40, + "train_speed(iter/s)": 0.419666 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -0.73828125, + "eval_logits/rejected": 0.52734375, + "eval_logps/chosen": -320.0, + "eval_logps/rejected": -1208.0, + "eval_loss": 0.583984375, + "eval_nll_loss": 0.57421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0625, + "eval_rewards/margins": 6.0, + "eval_rewards/rejected": 4.0625, + "eval_runtime": 1.0568, + "eval_samples_per_second": 3.785, + "eval_steps_per_second": 0.946, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.5832429010086848, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": 0.10170898586511612, + "logits/rejected": 0.05078125, + "logps/chosen": -606.7999877929688, + "logps/rejected": -670.4000244140625, + "loss": 0.6896728515625, + "memory(GiB)": 36.36, + "nll_loss": 0.6890624761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.175000190734863, + "rewards/margins": 9.737500190734863, + "rewards/rejected": 0.4398437440395355, + "step": 45, + "train_speed(iter/s)": 0.41586 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.49952125769013667, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -0.04960937425494194, + "logits/rejected": -0.02602539025247097, + "logps/chosen": -610.4000244140625, + "logps/rejected": -663.5999755859375, + "loss": 0.64876708984375, + "memory(GiB)": 36.36, + "nll_loss": 0.647656261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.912500381469727, + 
"rewards/margins": 10.362500190734863, + "rewards/rejected": 0.546093761920929, + "step": 50, + "train_speed(iter/s)": 0.4198 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.31824407050814685, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": 0.22453613579273224, + "logits/rejected": 0.23637695610523224, + "logps/chosen": -608.4000244140625, + "logps/rejected": -621.5999755859375, + "loss": 0.68779296875, + "memory(GiB)": 36.36, + "nll_loss": 0.6875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.149999618530273, + "rewards/margins": 10.612500190734863, + "rewards/rejected": 0.55908203125, + "step": 55, + "train_speed(iter/s)": 0.423222 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.23207078350156118, + "learning_rate": 5e-05, + "logits/chosen": 0.3818359375, + "logits/rejected": -0.07106933742761612, + "logps/chosen": -654.0, + "logps/rejected": -466.79998779296875, + "loss": 0.577880859375, + "memory(GiB)": 36.36, + "nll_loss": 0.578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.862500190734863, + "rewards/margins": 11.5, + "rewards/rejected": 0.3648925721645355, + "step": 60, + "train_speed(iter/s)": 0.425778 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -0.6015625, + "eval_logits/rejected": 0.65234375, + "eval_logps/chosen": -304.0, + "eval_logps/rejected": -1192.0, + "eval_loss": 0.55859375, + "eval_nll_loss": 0.53515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.625, + "eval_rewards/margins": 6.3125, + "eval_rewards/rejected": 5.34375, + "eval_runtime": 1.0788, + "eval_samples_per_second": 3.708, + "eval_steps_per_second": 0.927, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.6795800719500261, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -0.07011719048023224, + "logits/rejected": 0.01972656324505806, + "logps/chosen": -590.4000244140625, + "logps/rejected": -555.2000122070312, + "loss": 0.60504150390625, + "memory(GiB)": 36.36, + 
"nll_loss": 0.604296863079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.800000190734863, + "rewards/margins": 10.949999809265137, + "rewards/rejected": 0.851757824420929, + "step": 65, + "train_speed(iter/s)": 0.423328 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.580148749174503, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": 0.0538330078125, + "logits/rejected": 0.05624999850988388, + "logps/chosen": -501.79998779296875, + "logps/rejected": -551.4000244140625, + "loss": 0.6089111328125, + "memory(GiB)": 36.36, + "nll_loss": 0.6078125238418579, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.725000381469727, + "rewards/margins": 10.162500381469727, + "rewards/rejected": 0.565625011920929, + "step": 70, + "train_speed(iter/s)": 0.425442 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.6687994812508333, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": 0.16015625, + "logits/rejected": 0.23457030951976776, + "logps/chosen": -650.0, + "logps/rejected": -695.2000122070312, + "loss": 0.688037109375, + "memory(GiB)": 36.36, + "nll_loss": 0.688281238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.574999809265137, + "rewards/margins": 12.662500381469727, + "rewards/rejected": -0.09501953423023224, + "step": 75, + "train_speed(iter/s)": 0.427366 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.779151521456302, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": 0.17226561903953552, + "logits/rejected": 0.12241210788488388, + "logps/chosen": -528.7999877929688, + "logps/rejected": -652.7999877929688, + "loss": 0.6132568359375, + "memory(GiB)": 36.36, + "nll_loss": 0.602343738079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.824999809265137, + "rewards/margins": 12.537500381469727, + "rewards/rejected": -0.6976562738418579, + "step": 80, + "train_speed(iter/s)": 0.428029 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -0.53125, + "eval_logits/rejected": 0.6875, + 
"eval_logps/chosen": -298.0, + "eval_logps/rejected": -1184.0, + "eval_loss": 0.56982421875, + "eval_nll_loss": 0.5234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.1875, + "eval_rewards/margins": 6.03125, + "eval_rewards/rejected": 6.15625, + "eval_runtime": 1.013, + "eval_samples_per_second": 3.949, + "eval_steps_per_second": 0.987, + "step": 80 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.7098613362463773, + "learning_rate": 1.6760206719303105e-05, + "logits/chosen": 0.06606445461511612, + "logits/rejected": 0.2003173828125, + "logps/chosen": -527.2000122070312, + "logps/rejected": -765.5999755859375, + "loss": 0.62239990234375, + "memory(GiB)": 36.36, + "nll_loss": 0.622265636920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.862500190734863, + "rewards/margins": 11.199999809265137, + "rewards/rejected": 1.6593749523162842, + "step": 85, + "train_speed(iter/s)": 0.425977 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.37164934301407404, + "learning_rate": 1.1697777844051105e-05, + "logits/chosen": 0.2582031190395355, + "logits/rejected": 0.01315918006002903, + "logps/chosen": -634.4000244140625, + "logps/rejected": -591.5999755859375, + "loss": 0.617724609375, + "memory(GiB)": 36.36, + "nll_loss": 0.6167968511581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.512499809265137, + "rewards/margins": 12.149999618530273, + "rewards/rejected": 0.37109375, + "step": 90, + "train_speed(iter/s)": 0.42749 + }, + { + "epoch": 2.5, + "grad_norm": 0.2667601385654993, + "learning_rate": 7.444166378150013e-06, + "logits/chosen": 0.2818359434604645, + "logits/rejected": 0.369140625, + "logps/chosen": -699.2000122070312, + "logps/rejected": -696.7999877929688, + "loss": 0.70126953125, + "memory(GiB)": 36.36, + "nll_loss": 0.7015625238418579, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.4375, + "rewards/margins": 12.287500381469727, + "rewards/rejected": 1.1515624523162842, + "step": 95, + "train_speed(iter/s)": 
0.428148 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.5516332791700669, + "learning_rate": 4.089194655986306e-06, + "logits/chosen": 0.06599120795726776, + "logits/rejected": 0.1904296875, + "logps/chosen": -518.7999877929688, + "logps/rejected": -540.7999877929688, + "loss": 0.6071044921875, + "memory(GiB)": 45.6, + "nll_loss": 0.6070312261581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.637499809265137, + "rewards/margins": 12.212499618530273, + "rewards/rejected": 0.4359374940395355, + "step": 100, + "train_speed(iter/s)": 0.428781 + }, + { + "epoch": 2.6315789473684212, + "eval_logits/chosen": -0.515625, + "eval_logits/rejected": 0.69140625, + "eval_logps/chosen": -298.0, + "eval_logps/rejected": -1184.0, + "eval_loss": 0.542724609375, + "eval_nll_loss": 0.51953125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.375, + "eval_rewards/margins": 6.1875, + "eval_rewards/rejected": 6.15625, + "eval_runtime": 1.0634, + "eval_samples_per_second": 3.762, + "eval_steps_per_second": 0.94, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 54832257204224.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/training_args.bin b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..adcfb0815e4cd34d58e6017e0fcc324580304fec --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3fd8e548d0ee82a07e59ed98ea8de6a14052438085f0628cc1e4751979417d39 +size 9016 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/zero_to_fp32.py b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/README.md b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5978e4fb1fd6c90157286b85839ad77358f92787 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct 
and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/adapter_config.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a24ef59a277a25429baa3b5582e16c9758a877af --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/adapter_model.safetensors b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c2aaa53a12a64dfa0aab167b9233cc7544684733 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0476d2cf16b00d5aaa725d714afd28ec1c6d8088393a80bc4e2aa7661ef4b092 +size 18516456 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/additional_config.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/args.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/args.json new file mode 100644 index 0000000000000000000000000000000000000000..99c4b7064894f541e8747d7da5ae79faee20027e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + 
"model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": 
"steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": 
null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5fe5ac443f381fdcdf0eb1995c1905db2f07e4aa --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d40f928849f1d2e4fbdc058bbacb2e56597f601f79605761db4913334f39a345 +size 13852592 diff --git 
a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ed1b7b0ea07369eda3be11fc3c086b707931940 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:985d36c403fee8f99fa1454391641d9a897dc65f2195603248fc8a4e7992e28d +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc70dda05aa7fecc13881435a9dc467ef162d98c --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb36aacf68f2bd98b43c7922a9d4b3efcc46f243e4a4b20426abfb96dc32a018 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e184be0fe0739b8c4c1abe99044635a7a6197997 --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d20df96208f17aded690c8e923ea66dd38a83d1bfb7aa9e453e0cc73ed8cb2f5 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..114bf1cc2aeeffcae9cd206623f91ad5d2e0506a --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9756b881d480be5c946e9aa4eff66ad5aa54377c9690eaa2bb867605f07ac9d2 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f296efeae79a0b31bace1e17c3f1d4e75b51b00 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e537096bcba87c9f76f7d5b1541b215e7340bcaba0986dba77ca7b4bbef2ea5 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..161fb5fd087e51704df651b8faf4e943f392e3e9 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a0a711f1d78e3a5b2566c6fba0bcaa41a660aef52dbb17f748ccd591faf88ca +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec4b07993e1faadff071cc2d68c2cfc54cef5ab6 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6926922f5d8ac836bb56627f053395ca205a470c2fd35c70855a0b7d8fcaab2a +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..712531b3929eb216d6f6ab3c15e9916ea3bdc9a9 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b94a46b514e372b6aa285da91cc963d4fc73147913e67ca82c5d961ef8ec4528 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88d778b9f3e69f0fe181daaa6cc07a06b0074db9 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e370c23ef3216d3ab68d74acf909be24fd40c73847767c97eaa781eba304445 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..796cb5244ac5b8068ee7aeab5e7d32f07a0509f4 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac94ddc95914b5325ded9296ddecc8f122b3be1ab10aec33d1783bc02e578329 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1491494e94010f12b0d5ff44cefeb8ae8a01e3a6 --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61d4b2ae50c4f1496ca06c736a915fbf08f63c8323172ca7543727bbb50c2b44 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8a7a4dcc7810969fb3590388115d0c09ff7ce9b --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc37ff291e8943cff6f721675a0488b3bc1e22c3b6582a1cad5c85ed3afc22a5 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28a53271e53663ae1c856709cc4b8511e00a07d6 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7778992f5178d314168b21b0a163724cc321a625adc07d43233dfc81a6909d63 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..546d29576ed451ebd6ce3f89c8e924388467906a --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7528548b4bdb5df70ac508f91977ce7ef4294bd65648b7ded3481daf2fefc52 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56bec4a426d0bdc0c8e1817f8c8bda068ade7be3 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d28928e687f119726af90f7fb9b37deb93b2f9a267ae0e24f5a16cf2271c4bb0 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/latest b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/latest new file mode 100644 index 0000000000000000000000000000000000000000..aad80f76777fd4d23b0b81026f4601524335cbe1 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/latest @@ -0,0 +1 @@ +global_step114 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_0.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_0.pth new 
file mode 100644 index 0000000000000000000000000000000000000000..ee83ae5e323e0bb676daf05f7f41b7951b49c7af --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9162e03c562553a5d9d13120f544d3c47ea71bb39aa44e18253675e17ed4a4 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_1.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cd0edf74beb406ae74d27fac689e74cc1a7d12b --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4809456871b3a40c8db7e0926a9db11b01149a1d483fb29b16fc69dabaf36c6f +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_2.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..378e4e23e02084387cef58f5bfa08ef5b23ef1b3 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb6bcf25ff148b74eea7dd4895fc42e9433538fff5d75f0d2ae6cb0c2fdadf0 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_3.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9d23b00a6e62ab23a83b688e4077471f0501ba0 --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f00ea04cd1a52c539d9cc948ac8a04676d6b99702acd09149565f781806f63f +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_4.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf6105fec105f5636599de6b5ea414adc300ed30 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5571fb2fc1b413792b01ac691c759786855573992bab1d14875faccdaf8c881e +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_5.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..983c7580e17a958602e3218e885e88e85d4ed9a0 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59019ba23ead9c15851cb4349397254458ce50ea3c2987090404f4f3842c6d8f +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_6.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f87fedb0a1eac5d251eeb1e7cf58190877f6b60 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:45fdffda57fda4a555da7a5de6fc6ec7324e0dae048b92519af6c4f6a1bc7412 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_7.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d32d0d7a4ca68837a8e91f7101758f2f48116bde --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fb2c13e63aba83c4505fae1639f79a33853d8f1bebe20cecb73bf53c8e7c46 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/scheduler.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a86ac614a477eb67963adb2c8c07f37c79ded059 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7a9fd18bda7faa50931342147a7de5605bed0f91f6c70d821e84b7bf8f444f +size 1064 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/trainer_state.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..60f2a3d80f479658d352d7a3668b63240d708f0f --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/trainer_state.json @@ -0,0 +1,549 @@ +{ + "best_metric": 0.54199219, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114", + "epoch": 3.0, + "eval_steps": 20, + 
"global_step": 114, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 12.157362496957303, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.6875, + "logits/rejected": 0.1416015625, + "logps/chosen": -768.0, + "logps/rejected": -284.0, + "loss": 1.384765625, + "memory(GiB)": 3.36, + "nll_loss": 0.69140625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.136299 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.539150660119484, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": 0.068756103515625, + "logits/rejected": -0.03271484375, + "logps/chosen": -809.5, + "logps/rejected": -460.5, + "loss": 2.017822265625, + "memory(GiB)": 15.74, + "nll_loss": 1.37890625, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.05777740478515625, + "rewards/margins": 0.1044921875, + "rewards/rejected": -0.04648590087890625, + "step": 5, + "train_speed(iter/s)": 0.30928 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.599031383319412, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.18076172471046448, + "logits/rejected": 0.168701171875, + "logps/chosen": -724.0, + "logps/rejected": -717.0, + "loss": 2.158203125, + "memory(GiB)": 36.36, + "nll_loss": 1.5546875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.964062511920929, + "rewards/margins": 0.3333984315395355, + "rewards/rejected": 0.6314452886581421, + "step": 10, + "train_speed(iter/s)": 0.364139 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.41127146668536, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": 0.3662109375, + "logits/rejected": 0.2630859315395355, + "logps/chosen": -729.5999755859375, + "logps/rejected": -533.5999755859375, + "loss": 1.2904296875, + "memory(GiB)": 36.36, + "nll_loss": 0.981249988079071, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.487499952316284, + "rewards/margins": 1.62890625, + "rewards/rejected": 0.8617187738418579, + "step": 15, + "train_speed(iter/s)": 0.391058 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 15.35539910021312, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": 0.12734374403953552, + "logits/rejected": 0.15620116889476776, + "logps/chosen": -484.3999938964844, + "logps/rejected": -604.7999877929688, + "loss": 1.4791015625, + "memory(GiB)": 36.36, + "nll_loss": 1.1945312023162842, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.234375, + "rewards/margins": 1.860937476158142, + "rewards/rejected": 2.3671875, + "step": 20, + "train_speed(iter/s)": 0.404763 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.59765625, + "eval_logits/rejected": 0.58203125, + "eval_logps/chosen": -346.0, + "eval_logps/rejected": -1200.0, + "eval_loss": 0.8779296875, + "eval_nll_loss": 0.66015625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.46875, + "eval_rewards/margins": 2.515625, + "eval_rewards/rejected": 4.9375, + "eval_runtime": 1.0501, + "eval_samples_per_second": 3.809, + "eval_steps_per_second": 0.952, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 9.750705999680513, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": 0.0028198242653161287, + "logits/rejected": 0.02851562574505806, + "logps/chosen": -708.7999877929688, + "logps/rejected": -666.0, + "loss": 1.101220703125, + "memory(GiB)": 36.36, + "nll_loss": 0.8804687261581421, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 6.09375, + "rewards/margins": 3.2359375953674316, + "rewards/rejected": 2.8531250953674316, + "step": 25, + "train_speed(iter/s)": 0.402214 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 1.1620640103930189, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -0.14511719346046448, + "logits/rejected": -0.010668945498764515, + 
"logps/chosen": -493.20001220703125, + "logps/rejected": -559.5999755859375, + "loss": 0.742919921875, + "memory(GiB)": 36.36, + "nll_loss": 0.717968761920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.668749809265137, + "rewards/margins": 4.993750095367432, + "rewards/rejected": 2.6703124046325684, + "step": 30, + "train_speed(iter/s)": 0.409863 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.5082652670453853, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -0.0003417968691792339, + "logits/rejected": -0.09726562350988388, + "logps/chosen": -783.2000122070312, + "logps/rejected": -649.2000122070312, + "loss": 0.8309814453125, + "memory(GiB)": 36.36, + "nll_loss": 0.778124988079071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.25, + "rewards/margins": 5.650000095367432, + "rewards/rejected": 2.6031250953674316, + "step": 35, + "train_speed(iter/s)": 0.415557 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.1139559192769446, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": 0.05820312350988388, + "logits/rejected": -0.06640625, + "logps/chosen": -599.5999755859375, + "logps/rejected": -630.7999877929688, + "loss": 0.725, + "memory(GiB)": 36.36, + "nll_loss": 0.782031238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.856249809265137, + "rewards/margins": 7.199999809265137, + "rewards/rejected": 1.6640625, + "step": 40, + "train_speed(iter/s)": 0.419666 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -0.73828125, + "eval_logits/rejected": 0.52734375, + "eval_logps/chosen": -320.0, + "eval_logps/rejected": -1208.0, + "eval_loss": 0.583984375, + "eval_nll_loss": 0.57421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0625, + "eval_rewards/margins": 6.0, + "eval_rewards/rejected": 4.0625, + "eval_runtime": 1.0568, + "eval_samples_per_second": 3.785, + "eval_steps_per_second": 0.946, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 
0.5832429010086848, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": 0.10170898586511612, + "logits/rejected": 0.05078125, + "logps/chosen": -606.7999877929688, + "logps/rejected": -670.4000244140625, + "loss": 0.6896728515625, + "memory(GiB)": 36.36, + "nll_loss": 0.6890624761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.175000190734863, + "rewards/margins": 9.737500190734863, + "rewards/rejected": 0.4398437440395355, + "step": 45, + "train_speed(iter/s)": 0.41586 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.49952125769013667, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -0.04960937425494194, + "logits/rejected": -0.02602539025247097, + "logps/chosen": -610.4000244140625, + "logps/rejected": -663.5999755859375, + "loss": 0.64876708984375, + "memory(GiB)": 36.36, + "nll_loss": 0.647656261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.912500381469727, + "rewards/margins": 10.362500190734863, + "rewards/rejected": 0.546093761920929, + "step": 50, + "train_speed(iter/s)": 0.4198 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.31824407050814685, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": 0.22453613579273224, + "logits/rejected": 0.23637695610523224, + "logps/chosen": -608.4000244140625, + "logps/rejected": -621.5999755859375, + "loss": 0.68779296875, + "memory(GiB)": 36.36, + "nll_loss": 0.6875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.149999618530273, + "rewards/margins": 10.612500190734863, + "rewards/rejected": 0.55908203125, + "step": 55, + "train_speed(iter/s)": 0.423222 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.23207078350156118, + "learning_rate": 5e-05, + "logits/chosen": 0.3818359375, + "logits/rejected": -0.07106933742761612, + "logps/chosen": -654.0, + "logps/rejected": -466.79998779296875, + "loss": 0.577880859375, + "memory(GiB)": 36.36, + "nll_loss": 0.578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.862500190734863, + 
"rewards/margins": 11.5, + "rewards/rejected": 0.3648925721645355, + "step": 60, + "train_speed(iter/s)": 0.425778 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -0.6015625, + "eval_logits/rejected": 0.65234375, + "eval_logps/chosen": -304.0, + "eval_logps/rejected": -1192.0, + "eval_loss": 0.55859375, + "eval_nll_loss": 0.53515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.625, + "eval_rewards/margins": 6.3125, + "eval_rewards/rejected": 5.34375, + "eval_runtime": 1.0788, + "eval_samples_per_second": 3.708, + "eval_steps_per_second": 0.927, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.6795800719500261, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -0.07011719048023224, + "logits/rejected": 0.01972656324505806, + "logps/chosen": -590.4000244140625, + "logps/rejected": -555.2000122070312, + "loss": 0.60504150390625, + "memory(GiB)": 36.36, + "nll_loss": 0.604296863079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.800000190734863, + "rewards/margins": 10.949999809265137, + "rewards/rejected": 0.851757824420929, + "step": 65, + "train_speed(iter/s)": 0.423328 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.580148749174503, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": 0.0538330078125, + "logits/rejected": 0.05624999850988388, + "logps/chosen": -501.79998779296875, + "logps/rejected": -551.4000244140625, + "loss": 0.6089111328125, + "memory(GiB)": 36.36, + "nll_loss": 0.6078125238418579, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.725000381469727, + "rewards/margins": 10.162500381469727, + "rewards/rejected": 0.565625011920929, + "step": 70, + "train_speed(iter/s)": 0.425442 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.6687994812508333, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": 0.16015625, + "logits/rejected": 0.23457030951976776, + "logps/chosen": -650.0, + "logps/rejected": -695.2000122070312, + "loss": 0.688037109375, 
+ "memory(GiB)": 36.36, + "nll_loss": 0.688281238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.574999809265137, + "rewards/margins": 12.662500381469727, + "rewards/rejected": -0.09501953423023224, + "step": 75, + "train_speed(iter/s)": 0.427366 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.779151521456302, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": 0.17226561903953552, + "logits/rejected": 0.12241210788488388, + "logps/chosen": -528.7999877929688, + "logps/rejected": -652.7999877929688, + "loss": 0.6132568359375, + "memory(GiB)": 36.36, + "nll_loss": 0.602343738079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.824999809265137, + "rewards/margins": 12.537500381469727, + "rewards/rejected": -0.6976562738418579, + "step": 80, + "train_speed(iter/s)": 0.428029 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -0.53125, + "eval_logits/rejected": 0.6875, + "eval_logps/chosen": -298.0, + "eval_logps/rejected": -1184.0, + "eval_loss": 0.56982421875, + "eval_nll_loss": 0.5234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.1875, + "eval_rewards/margins": 6.03125, + "eval_rewards/rejected": 6.15625, + "eval_runtime": 1.013, + "eval_samples_per_second": 3.949, + "eval_steps_per_second": 0.987, + "step": 80 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.7098613362463773, + "learning_rate": 1.6760206719303105e-05, + "logits/chosen": 0.06606445461511612, + "logits/rejected": 0.2003173828125, + "logps/chosen": -527.2000122070312, + "logps/rejected": -765.5999755859375, + "loss": 0.62239990234375, + "memory(GiB)": 36.36, + "nll_loss": 0.622265636920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.862500190734863, + "rewards/margins": 11.199999809265137, + "rewards/rejected": 1.6593749523162842, + "step": 85, + "train_speed(iter/s)": 0.425977 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.37164934301407404, + "learning_rate": 1.1697777844051105e-05, + "logits/chosen": 
0.2582031190395355, + "logits/rejected": 0.01315918006002903, + "logps/chosen": -634.4000244140625, + "logps/rejected": -591.5999755859375, + "loss": 0.617724609375, + "memory(GiB)": 36.36, + "nll_loss": 0.6167968511581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.512499809265137, + "rewards/margins": 12.149999618530273, + "rewards/rejected": 0.37109375, + "step": 90, + "train_speed(iter/s)": 0.42749 + }, + { + "epoch": 2.5, + "grad_norm": 0.2667601385654993, + "learning_rate": 7.444166378150013e-06, + "logits/chosen": 0.2818359434604645, + "logits/rejected": 0.369140625, + "logps/chosen": -699.2000122070312, + "logps/rejected": -696.7999877929688, + "loss": 0.70126953125, + "memory(GiB)": 36.36, + "nll_loss": 0.7015625238418579, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.4375, + "rewards/margins": 12.287500381469727, + "rewards/rejected": 1.1515624523162842, + "step": 95, + "train_speed(iter/s)": 0.428148 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.5516332791700669, + "learning_rate": 4.089194655986306e-06, + "logits/chosen": 0.06599120795726776, + "logits/rejected": 0.1904296875, + "logps/chosen": -518.7999877929688, + "logps/rejected": -540.7999877929688, + "loss": 0.6071044921875, + "memory(GiB)": 45.6, + "nll_loss": 0.6070312261581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.637499809265137, + "rewards/margins": 12.212499618530273, + "rewards/rejected": 0.4359374940395355, + "step": 100, + "train_speed(iter/s)": 0.428781 + }, + { + "epoch": 2.6315789473684212, + "eval_logits/chosen": -0.515625, + "eval_logits/rejected": 0.69140625, + "eval_logps/chosen": -298.0, + "eval_logps/rejected": -1184.0, + "eval_loss": 0.542724609375, + "eval_nll_loss": 0.51953125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.375, + "eval_rewards/margins": 6.1875, + "eval_rewards/rejected": 6.15625, + "eval_runtime": 1.0634, + "eval_samples_per_second": 3.762, + "eval_steps_per_second": 0.94, + "step": 100 + }, + { + 
"epoch": 2.763157894736842, + "grad_norm": 0.9085203696227837, + "learning_rate": 1.70370868554659e-06, + "logits/chosen": 0.14899902045726776, + "logits/rejected": 0.14143066108226776, + "logps/chosen": -547.0999755859375, + "logps/rejected": -508.0, + "loss": 0.6044189453125, + "memory(GiB)": 45.6, + "nll_loss": 0.603515625, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.1875, + "rewards/margins": 11.324999809265137, + "rewards/rejected": 0.8433593511581421, + "step": 105, + "train_speed(iter/s)": 0.426933 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.5233415847185563, + "learning_rate": 3.380821129028489e-07, + "logits/chosen": 0.072998046875, + "logits/rejected": 0.19794921576976776, + "logps/chosen": -565.5999755859375, + "logps/rejected": -526.2000122070312, + "loss": 0.5385528564453125, + "memory(GiB)": 45.6, + "nll_loss": 0.538281261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.399999618530273, + "rewards/margins": 11.537500381469727, + "rewards/rejected": 0.8519531488418579, + "step": 110, + "train_speed(iter/s)": 0.428375 + }, + { + "epoch": 3.0, + "eval_logits/chosen": -0.515625, + "eval_logits/rejected": 0.6953125, + "eval_logps/chosen": -296.0, + "eval_logps/rejected": -1184.0, + "eval_loss": 0.5419921875, + "eval_nll_loss": 0.51953125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.375, + "eval_rewards/margins": 6.25, + "eval_rewards/rejected": 6.15625, + "eval_runtime": 1.0337, + "eval_samples_per_second": 3.87, + "eval_steps_per_second": 0.967, + "step": 114 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 62364809592832.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": 
null +} diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/training_args.bin b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..adcfb0815e4cd34d58e6017e0fcc324580304fec --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd8e548d0ee82a07e59ed98ea8de6a14052438085f0628cc1e4751979417d39 +size 9016 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/zero_to_fp32.py b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/README.md b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5978e4fb1fd6c90157286b85839ad77358f92787 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct 
and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/adapter_config.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a24ef59a277a25429baa3b5582e16c9758a877af --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/adapter_model.safetensors b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fc1800358a55ee08640860a4e28b891f597f6e90 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b019d484850c23d8df6ed723973c3435d674c0f7473ba03015dec7e2f2ddc388 +size 18516456 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/additional_config.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/args.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..99c4b7064894f541e8747d7da5ae79faee20027e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + 
"model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": 
"steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": 
null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86d1536469d59534cedbaf3f11c24ef0702af44f --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7081fdb66e4ea2140fa89be65c043e0b9ddfbb88bca1ea20ca5b9501dd267b2a +size 13852592 diff --git 
a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c29bdd14558d7acf73bfa83e2102782ff54be8a --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de2d484867e452a04999791243afb19dc2fd28d32e981b980a72564e916bc1e +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50ca0f7ba418b858ddd3c054b3bf08d33cf45fe0 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3ee06154df37ca6ff50ff694a2f925f9fb06c74f2e26f36a6523986aca01f3b +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fe25524ad37987fcd5e8a517c149b221416ff73 --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0262fe5ed065947849d3055c3ebd23cc75436d9e8cdb1b75af033ef9154fb704 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b43771f58c685805bf9b01acee4da9aa4a0fd661 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:663a9c37bf4941d25bfbf44479b56e7bc78a7b4aa99210b2cddc30159726070f +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7172c2cd6d2a8de6dfc0ff87828cd3ede36d862 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:597d09482ba7edef55d1e2467e09c315e31a5e11008b66ea5a0209e941a51bba +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da8fb3dc4787401b93fa751569224a347e297438 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bc2090f00e2ac5bf8e5eb666a9790300868a9542430d293a341e7646ed8c962 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7db803ee77413d323438a4810ed57361e5d10ac --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa5db862ba8cdcac9ac3a891b67dde7ebd7142cd98f546067dc9d4730da105d2 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55a0f7959ca21c2ba9f4a03bda097559ff1e6d28 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fba3a81d3c85c847622d405b4f3a71b0692e76d8de905d730bde35f168fd5f33 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14fd344ac81fdf4dd53077f7cfb0eee7d09590bb --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbc961edafc97062497245103be726810c4fb2da26b23b77d881946a071fd1ba +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be23772d3af3955be3f5a6e7f2a5c6ba6ed6e96c --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f72e8c6c034f63af75f6617212e759262a1efa79060acd2ed466e8449f1ebcd +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a28c9cc3971ab0b5b78e2221e5e7f0d9901b3faa --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73fb5bd2d1d0d97a97f4c5ee19a0263bccd5b5bb2f94b468db727980c61ef5fb +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adcfd17bcd0273e2fd0c82cb92a9541b7ede0747 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13cfc3a40bc5cab959bf3ad0192ee57e3f35be78bca743a46e471e7423562b0b +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5012fb68bd2cdf916744aafdfdc3e31748ed0d1 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7638e30007b46f2d6eeb3addadad9c3e1e4054893943ea87b2f799fa77abac4 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..addcf2bf6eaaa2a823dba8172c5debe80c267ae8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff265ae0047fc613d571ca733ea39667109fd0467dd23447a6759d1d5da20ac9 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d5e3986ddd7074d97a14d468f067811f45efd9a --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bc093bb881e46304eb3ac5b5ead90418f52b3396d43b875bd8b3c8a5756383f +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/latest b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_0.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_0.pth new file mode 100644 
index 0000000000000000000000000000000000000000..b346349ce12dd5a17d4b91ed2a5722bb52550950 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_1.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f3c6994456cb8d0592a5375d99503c8924b1c4 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_2.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..be044f6ceeed587d30e80c2f72d5aa19fdc9947b --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_3.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc825249656a9b858782542bd3f4386250f1dfe0 --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_4.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d30f52a44be563c152ae09db6ae934da6da0d3ed --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_5.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8715d27ab23ae545d58039cf949cc44ecc1da5e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_6.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed791b6ef76eadf0b0c55a5733411771e2ae027 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_7.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..800c3bbbc5edf7db01a8316069d439c5fb8d8c30 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/scheduler.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e201402bb36891e48e2b7110304ad87df61a6070 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b40f5e8ba2f299f4eda41d6964ef1f313f53d1f8f687ebd6938ce3242fb4c3 +size 1064 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/trainer_state.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..949ad9cc5a30e1bea819344d19b7dda3469a8731 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.87792969, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20", + "epoch": 0.5263157894736842, + "eval_steps": 20, + 
"global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 12.157362496957303, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.6875, + "logits/rejected": 0.1416015625, + "logps/chosen": -768.0, + "logps/rejected": -284.0, + "loss": 1.384765625, + "memory(GiB)": 3.36, + "nll_loss": 0.69140625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.136299 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.539150660119484, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": 0.068756103515625, + "logits/rejected": -0.03271484375, + "logps/chosen": -809.5, + "logps/rejected": -460.5, + "loss": 2.017822265625, + "memory(GiB)": 15.74, + "nll_loss": 1.37890625, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.05777740478515625, + "rewards/margins": 0.1044921875, + "rewards/rejected": -0.04648590087890625, + "step": 5, + "train_speed(iter/s)": 0.30928 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.599031383319412, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.18076172471046448, + "logits/rejected": 0.168701171875, + "logps/chosen": -724.0, + "logps/rejected": -717.0, + "loss": 2.158203125, + "memory(GiB)": 36.36, + "nll_loss": 1.5546875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.964062511920929, + "rewards/margins": 0.3333984315395355, + "rewards/rejected": 0.6314452886581421, + "step": 10, + "train_speed(iter/s)": 0.364139 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.41127146668536, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": 0.3662109375, + "logits/rejected": 0.2630859315395355, + "logps/chosen": -729.5999755859375, + "logps/rejected": -533.5999755859375, + "loss": 1.2904296875, + "memory(GiB)": 36.36, + "nll_loss": 0.981249988079071, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.487499952316284, + "rewards/margins": 1.62890625, + "rewards/rejected": 0.8617187738418579, + "step": 15, + "train_speed(iter/s)": 0.391058 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 15.35539910021312, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": 0.12734374403953552, + "logits/rejected": 0.15620116889476776, + "logps/chosen": -484.3999938964844, + "logps/rejected": -604.7999877929688, + "loss": 1.4791015625, + "memory(GiB)": 36.36, + "nll_loss": 1.1945312023162842, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.234375, + "rewards/margins": 1.860937476158142, + "rewards/rejected": 2.3671875, + "step": 20, + "train_speed(iter/s)": 0.404763 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.59765625, + "eval_logits/rejected": 0.58203125, + "eval_logps/chosen": -346.0, + "eval_logps/rejected": -1200.0, + "eval_loss": 0.8779296875, + "eval_nll_loss": 0.66015625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.46875, + "eval_rewards/margins": 2.515625, + "eval_rewards/rejected": 4.9375, + "eval_runtime": 1.0501, + "eval_samples_per_second": 3.809, + "eval_steps_per_second": 0.952, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 10539080777728.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/training_args.bin b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..adcfb0815e4cd34d58e6017e0fcc324580304fec --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd8e548d0ee82a07e59ed98ea8de6a14052438085f0628cc1e4751979417d39 +size 9016 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/zero_to_fp32.py b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model``: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder.
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/README.md b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5978e4fb1fd6c90157286b85839ad77358f92787 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct 
and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/adapter_config.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a24ef59a277a25429baa3b5582e16c9758a877af --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/adapter_model.safetensors b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3a16abab1f1983aab2452611875f68681c57409b --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12862d247e41fde2c4bc898b84fcf721f1f232e305e1ca489f2123d16483d53a +size 18516456 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/additional_config.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/args.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..99c4b7064894f541e8747d7da5ae79faee20027e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + 
"model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": 
"steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": 
null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c3740a343a544a0fb7d3118e0b3371e8d94e7cb --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:774495a56f1a6243672132f4561f9c7629b08c7075649ab7ffa67ed97cfa28a4 +size 13852592 diff --git 
a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ede444a8d679bb983500d1c011f72756fb3ac6c2 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:755c7790f52ed93bfd4c3dc1db11c273ef9aa5c3e4afa49b8b8e467e5428954e +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e314874f1d7cc9497633e06eae673bcecab0747 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45d88bb9d7b0abe4ac6d8bdfc95846bfb15db6b288d438060cf002c096d06b87 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab1ebd4e50d3e32cb2239898e81b0139a34f3eb2 --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:154771a93eddebd8a91f23b7b513ed7ad40d300f294ccdc2933eb30dea817cab +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5746a2188f0acec89edc5b7792964e38f75f76b6 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52ea1088d0a0b3801471576e58c08d296343ceeb6e9586a183a2b33536e25e00 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be8e7adfb128eedad22988d3f2c0853c7b92a6d2 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90f29e664e3711b079f4f7493e11faa8f7537181a15aa70466bec4ea485c9cf3 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..261729b20f2a68bfd86966e3e0df4c5a4d6ed4ed --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86aad9d3e70c74a0262d991edcf4f3a75e25eba114f3a00d2b15192d7c5e7823 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d04f09160c975fe10c20ec19afb27e558e9f767 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4ceaf48931b18002ad95f5e273bdfc389059c8556c5ad1e206b1b7b28e352eb +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..368a915ecb07e992bc878263e6a4f701dcd54371 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7d69802b4f88967794748b35a4f7ca0893ff29d1c61fda0c6a591c465abadd24 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..048da81d675924294abf56e83a1f7fffa58c57f8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c7854ce2a8dbb1192befed3b5c2b9ed97a600a6ee83b88fa83a110ba6f88cf +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19c38c1a76d0df78c2c5ae50e1467b80917798f8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3797d4722b98718c1cffdac68bccb307adff9931f949d69a9e1bce1e5177f78b +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d934cc6c58ca2991b49d5f242c478c50a65956c --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bab01d185f8557caadb70aa1227dc017b2a731ba2404af2a53e4c7714e644c7 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc3233569bc09efff1da61a02e95fcba6eff7350 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1411aee792a258b18e1b908d13d265d6d23b4056c935b22721549bd72d8d295c +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e227a267268ff7a7c2d3343d15728854d9653842 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:789d5c683cac5ac27a5f6b272b9685f12421bdd74e2de4647643b296d514365a +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..313788aae032b6822cd30db76345590d110e61f1 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2df67e66fe8b74efdbeaf7ad01d8c8c0efd99058fa0fc68f9a7eda26383a62ce +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b6300f63ee6e8723736bc1a505b827d6bcc6ee3 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd9ad344977cc5293ea6f40f528f18c3083025f4df9ea19333677905f8e7cc7 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/latest b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_0.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_0.pth new file mode 100644 
index 0000000000000000000000000000000000000000..4e5b7e2ec90fdb824c8932464c1d9068330655a7 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36d2a2034ebb05cb71c510897f2795b31164e50f17b270bc25d2be3ad9a17b22 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_1.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d8d7722fc72cab6d492b76cb99c8177dcc47544 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060dfdb1c49102cbdc8868a6031e68787601b4ccd782f3fb9b137e20c1fd2c7a +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_2.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c9f84eff30cfa9ea1feedaf262d61fb12e4cba7 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af01895cb66e616591f2e4baa8dcd8151530eab133c73571ccb31c74f35422ce +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_3.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6eebfb928f8e91eff0ea1645a20b5aa4465c705b --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677921992b1e0cef3aee776f245975003d22f51d9bd6ed20f248ded1deb72fa9 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_4.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..0866030a266c6d003cc378a9418a723f69e8ab99 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d69353c629541c690c5471f8ec05fdab2bfecf3d37afaa436bc45939da6db68f +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_5.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..554638d77107f832d7aa51c61645ee2d6c48a36d --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e40ba6668cc03c9162c68a933d164bf38ae2d196a9a6fec03ae615491201185 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_6.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..964331b65172a1bcac03e4673415fa787f724268 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:870968fea834e24b2e099cf3e4fe1e3fb8caf38d8f8e5b790d7d47386d4d05f5 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_7.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd4754d65217d0f9d1f2d3334397df7a8a079652 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9e19618bee7c6ef43256fea25abe19bca88535eb1e7dc213cde8929ae4e8180 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/scheduler.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d2abd2d1feb7e9804d318f0409ab46d47248ca5 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc0cfcde03016592eed8191f897341f523bbb99d728821c8afed66eae5a64729 +size 1064 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/trainer_state.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e91790f80153da893b1a6aed6d8852510d6d913c --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.58398438, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40", + "epoch": 1.0526315789473684, + "eval_steps": 20, + 
"global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 12.157362496957303, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.6875, + "logits/rejected": 0.1416015625, + "logps/chosen": -768.0, + "logps/rejected": -284.0, + "loss": 1.384765625, + "memory(GiB)": 3.36, + "nll_loss": 0.69140625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.136299 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.539150660119484, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": 0.068756103515625, + "logits/rejected": -0.03271484375, + "logps/chosen": -809.5, + "logps/rejected": -460.5, + "loss": 2.017822265625, + "memory(GiB)": 15.74, + "nll_loss": 1.37890625, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.05777740478515625, + "rewards/margins": 0.1044921875, + "rewards/rejected": -0.04648590087890625, + "step": 5, + "train_speed(iter/s)": 0.30928 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.599031383319412, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.18076172471046448, + "logits/rejected": 0.168701171875, + "logps/chosen": -724.0, + "logps/rejected": -717.0, + "loss": 2.158203125, + "memory(GiB)": 36.36, + "nll_loss": 1.5546875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.964062511920929, + "rewards/margins": 0.3333984315395355, + "rewards/rejected": 0.6314452886581421, + "step": 10, + "train_speed(iter/s)": 0.364139 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.41127146668536, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": 0.3662109375, + "logits/rejected": 0.2630859315395355, + "logps/chosen": -729.5999755859375, + "logps/rejected": -533.5999755859375, + "loss": 1.2904296875, + "memory(GiB)": 36.36, + "nll_loss": 0.981249988079071, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.487499952316284, + "rewards/margins": 1.62890625, + "rewards/rejected": 0.8617187738418579, + "step": 15, + "train_speed(iter/s)": 0.391058 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 15.35539910021312, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": 0.12734374403953552, + "logits/rejected": 0.15620116889476776, + "logps/chosen": -484.3999938964844, + "logps/rejected": -604.7999877929688, + "loss": 1.4791015625, + "memory(GiB)": 36.36, + "nll_loss": 1.1945312023162842, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.234375, + "rewards/margins": 1.860937476158142, + "rewards/rejected": 2.3671875, + "step": 20, + "train_speed(iter/s)": 0.404763 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.59765625, + "eval_logits/rejected": 0.58203125, + "eval_logps/chosen": -346.0, + "eval_logps/rejected": -1200.0, + "eval_loss": 0.8779296875, + "eval_nll_loss": 0.66015625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.46875, + "eval_rewards/margins": 2.515625, + "eval_rewards/rejected": 4.9375, + "eval_runtime": 1.0501, + "eval_samples_per_second": 3.809, + "eval_steps_per_second": 0.952, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 9.750705999680513, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": 0.0028198242653161287, + "logits/rejected": 0.02851562574505806, + "logps/chosen": -708.7999877929688, + "logps/rejected": -666.0, + "loss": 1.101220703125, + "memory(GiB)": 36.36, + "nll_loss": 0.8804687261581421, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 6.09375, + "rewards/margins": 3.2359375953674316, + "rewards/rejected": 2.8531250953674316, + "step": 25, + "train_speed(iter/s)": 0.402214 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 1.1620640103930189, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -0.14511719346046448, + "logits/rejected": -0.010668945498764515, + 
"logps/chosen": -493.20001220703125, + "logps/rejected": -559.5999755859375, + "loss": 0.742919921875, + "memory(GiB)": 36.36, + "nll_loss": 0.717968761920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.668749809265137, + "rewards/margins": 4.993750095367432, + "rewards/rejected": 2.6703124046325684, + "step": 30, + "train_speed(iter/s)": 0.409863 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.5082652670453853, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -0.0003417968691792339, + "logits/rejected": -0.09726562350988388, + "logps/chosen": -783.2000122070312, + "logps/rejected": -649.2000122070312, + "loss": 0.8309814453125, + "memory(GiB)": 36.36, + "nll_loss": 0.778124988079071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.25, + "rewards/margins": 5.650000095367432, + "rewards/rejected": 2.6031250953674316, + "step": 35, + "train_speed(iter/s)": 0.415557 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.1139559192769446, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": 0.05820312350988388, + "logits/rejected": -0.06640625, + "logps/chosen": -599.5999755859375, + "logps/rejected": -630.7999877929688, + "loss": 0.725, + "memory(GiB)": 36.36, + "nll_loss": 0.782031238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.856249809265137, + "rewards/margins": 7.199999809265137, + "rewards/rejected": 1.6640625, + "step": 40, + "train_speed(iter/s)": 0.419666 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -0.73828125, + "eval_logits/rejected": 0.52734375, + "eval_logps/chosen": -320.0, + "eval_logps/rejected": -1208.0, + "eval_loss": 0.583984375, + "eval_nll_loss": 0.57421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0625, + "eval_rewards/margins": 6.0, + "eval_rewards/rejected": 4.0625, + "eval_runtime": 1.0568, + "eval_samples_per_second": 3.785, + "eval_steps_per_second": 0.946, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 114, + 
"num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 21948037758976.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/training_args.bin b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..adcfb0815e4cd34d58e6017e0fcc324580304fec --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd8e548d0ee82a07e59ed98ea8de6a14052438085f0628cc1e4751979417d39 +size 9016 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/zero_to_fp32.py b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . 
output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + 
ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def 
parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient. + Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/README.md b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5978e4fb1fd6c90157286b85839ad77358f92787 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct 
and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/adapter_config.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a24ef59a277a25429baa3b5582e16c9758a877af --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/adapter_model.safetensors b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7fceaa396ad9c319d1f6a4e18fc6398705de417e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23cbea577e7ca416abde051dccdea26a9cd3f4b90fe07d8ed41c9c7ef4e6f92d +size 18516456 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/additional_config.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/args.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..99c4b7064894f541e8747d7da5ae79faee20027e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + 
"model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": 
"steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": 
null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4be59301d67488e9bf5acf9703915c0039ebbc29 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935fee2c1a0c422df5a45f7c2ad13e149a0ebd07ac23eb76095d7f41efc22583 +size 13852592 diff --git 
a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02ea6b0dac7b31c49b93d2380a61b14c16a64736 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bf16af43866b9fe2957e33eddd33f0c2551014816a36f8cc884ae3a4005df11 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14a6ea0248d966f17ae798807f7eadf1536e4d98 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e19a7d28c5759e55141837606bd6b2205e1acb47376e0dd59711ac5c30a5328a +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a654a3fd5152c3b6ae78d7520d6d76e4281b6597 --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a47c71e9a67099762f3537a1048e6d4b7e8020690da3e687b804e7123ceb529d +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32dee7c3543a469ef40229ebc65ead00be428ddd --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b365228d9d48fad98c90b876330fc38faeb4ea54b67b954de19ec4505837d6b7 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bdb67a16e052a2f9c99d72b7bd75e4c7f739e96 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cf35d2a0e8c20df3659d619a37859582e69c5f9c07d0dfe34a8205e1becbc0f +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..833238634fd4528ef79f14cfedea7406eedf5247 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f308d24dcaaeb28b3aba133b082cff33f8b2f36d9b40b9210e18ac6e8b18455e +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a03dafed90ac19507e23263f6db7198978c961a2 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db6cdbf667f8a52efc65b15c0a3d5d635ec9d1021218d01599ebdef687abf186 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f854fbe908bf6eaf1df688c5d44efd4141ff337 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bc5eedf95ac83f25c3057428c5763fee3753836d9074bec1caa7d82e6aff8d79 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a61954f0e8f01787bc0449a7b43ec1232ad0c6c3 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ef77f9fa140f9c69a45c3603be6f1ca66fbdfc662980d7126189a4aff660fc1 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de7df6d38b9417f9782949e569dac893cc6db1c1 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0b9b0845cd7f952d5bd0d89085230c124ac1751e7957bb680495a6263028abd +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b93b678af6747a92710154e14ed6bffea2f81479 --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dadcaa1aafc29067c123749b62942515dc808b9d598b0c80d4344b8b477cae3 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..466e1fd413dec451a60f974d5f31433a9c88f2a4 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b15f136ee673fcd896ea8c2e42cf9c40fff72fa56d59508559db6485086dc9b +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aebe21a53703e032ab5cb94c457af8e89f45a178 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2de6cd6189fa6c1fc103f3693bc55bcd7baf02a9e385aff87ab8c6ca292037be +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03b4f4317c43166b32353f8d84a29dc694238649 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d03a9ea5d95415940aafdbca7e7cb1edcd890440fe792f2583b7f4a1b6eea921 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3eb01e183b2807c571a1c7d21ff010c4a5836e1 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1217fbadda5982ea08c3c898d19f9eb4ccc643d641c76eda44fcec38c99f375b +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/latest b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..6dac34b840ecfb636ba8ab1e4da79fa1bdc8c3d4 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/latest @@ -0,0 +1 @@ +global_step60 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_0.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_0.pth new file mode 100644 
index 0000000000000000000000000000000000000000..9d31438b0bfd38acb69501aeb325fee7751b84e8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0ef6f96a48e59aa52c4b471312c2a62378c19acc7ebbae839612b03a7d775a +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_1.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6759906b0863c54055155658e8d374770ecfc5f9 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab11d533c0fdad46ea8b8e295ba5fdb705e078eeb88cc28f37d82913508766e9 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_2.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..784c719c042a2cca1f38818c7e9638aab398c859 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615c168147e3465ce5bfab6da2ff4afc68566ce00ec0f0c6c9fc988038a58d0a +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_3.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32e705bcb6afbb2ab95f5c68c07d0ccc3d457df --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f71e8f8674ecaef9f8cdcbf7ac457a8b8ff15b12694ba2a2fffcb4b43f0f08 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_4.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b97b2d3011e43a6dbac487263b52a0b3a55c83 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cf6d674dab5545c300a55135f08ca935730a3d35e2c419fb0b333f19482c19 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_5.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1acb3d3b1d3de061b997d1dee57e44b465d0630e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2754f2cd8824702f027870d93748b3c0491b0ecd30f1e3d8e937116b2be6151f +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_6.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7760bbbcd6d3754ac81a5218adb6e0cd8036905b --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1385124ac55604598f45ea6e2d141f29456647d3e7c10d12ca64ec93d312be8d +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_7.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c4407057d0cb21c08140413cb320528190a941 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416538efaec7391fa8fe782fb15146b83e5612d9e1961292c34c53e964806873 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/scheduler.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d24bb2a6ed10249209e94b434ed554cac856d563 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c3a6465b9cb557a3a4db2097cdb877b1c624f5f645895d0cd27357a78258aa4 +size 1064 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/trainer_state.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c5d1959133640db0c5e22c447a0b3bfb38b3a4b3 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.55859375, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60", + "epoch": 1.5789473684210527, + "eval_steps": 20, + 
"global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 12.157362496957303, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.6875, + "logits/rejected": 0.1416015625, + "logps/chosen": -768.0, + "logps/rejected": -284.0, + "loss": 1.384765625, + "memory(GiB)": 3.36, + "nll_loss": 0.69140625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.136299 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.539150660119484, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": 0.068756103515625, + "logits/rejected": -0.03271484375, + "logps/chosen": -809.5, + "logps/rejected": -460.5, + "loss": 2.017822265625, + "memory(GiB)": 15.74, + "nll_loss": 1.37890625, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.05777740478515625, + "rewards/margins": 0.1044921875, + "rewards/rejected": -0.04648590087890625, + "step": 5, + "train_speed(iter/s)": 0.30928 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.599031383319412, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.18076172471046448, + "logits/rejected": 0.168701171875, + "logps/chosen": -724.0, + "logps/rejected": -717.0, + "loss": 2.158203125, + "memory(GiB)": 36.36, + "nll_loss": 1.5546875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.964062511920929, + "rewards/margins": 0.3333984315395355, + "rewards/rejected": 0.6314452886581421, + "step": 10, + "train_speed(iter/s)": 0.364139 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.41127146668536, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": 0.3662109375, + "logits/rejected": 0.2630859315395355, + "logps/chosen": -729.5999755859375, + "logps/rejected": -533.5999755859375, + "loss": 1.2904296875, + "memory(GiB)": 36.36, + "nll_loss": 0.981249988079071, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.487499952316284, + "rewards/margins": 1.62890625, + "rewards/rejected": 0.8617187738418579, + "step": 15, + "train_speed(iter/s)": 0.391058 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 15.35539910021312, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": 0.12734374403953552, + "logits/rejected": 0.15620116889476776, + "logps/chosen": -484.3999938964844, + "logps/rejected": -604.7999877929688, + "loss": 1.4791015625, + "memory(GiB)": 36.36, + "nll_loss": 1.1945312023162842, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.234375, + "rewards/margins": 1.860937476158142, + "rewards/rejected": 2.3671875, + "step": 20, + "train_speed(iter/s)": 0.404763 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.59765625, + "eval_logits/rejected": 0.58203125, + "eval_logps/chosen": -346.0, + "eval_logps/rejected": -1200.0, + "eval_loss": 0.8779296875, + "eval_nll_loss": 0.66015625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.46875, + "eval_rewards/margins": 2.515625, + "eval_rewards/rejected": 4.9375, + "eval_runtime": 1.0501, + "eval_samples_per_second": 3.809, + "eval_steps_per_second": 0.952, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 9.750705999680513, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": 0.0028198242653161287, + "logits/rejected": 0.02851562574505806, + "logps/chosen": -708.7999877929688, + "logps/rejected": -666.0, + "loss": 1.101220703125, + "memory(GiB)": 36.36, + "nll_loss": 0.8804687261581421, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 6.09375, + "rewards/margins": 3.2359375953674316, + "rewards/rejected": 2.8531250953674316, + "step": 25, + "train_speed(iter/s)": 0.402214 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 1.1620640103930189, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -0.14511719346046448, + "logits/rejected": -0.010668945498764515, + 
"logps/chosen": -493.20001220703125, + "logps/rejected": -559.5999755859375, + "loss": 0.742919921875, + "memory(GiB)": 36.36, + "nll_loss": 0.717968761920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.668749809265137, + "rewards/margins": 4.993750095367432, + "rewards/rejected": 2.6703124046325684, + "step": 30, + "train_speed(iter/s)": 0.409863 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.5082652670453853, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -0.0003417968691792339, + "logits/rejected": -0.09726562350988388, + "logps/chosen": -783.2000122070312, + "logps/rejected": -649.2000122070312, + "loss": 0.8309814453125, + "memory(GiB)": 36.36, + "nll_loss": 0.778124988079071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.25, + "rewards/margins": 5.650000095367432, + "rewards/rejected": 2.6031250953674316, + "step": 35, + "train_speed(iter/s)": 0.415557 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.1139559192769446, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": 0.05820312350988388, + "logits/rejected": -0.06640625, + "logps/chosen": -599.5999755859375, + "logps/rejected": -630.7999877929688, + "loss": 0.725, + "memory(GiB)": 36.36, + "nll_loss": 0.782031238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.856249809265137, + "rewards/margins": 7.199999809265137, + "rewards/rejected": 1.6640625, + "step": 40, + "train_speed(iter/s)": 0.419666 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -0.73828125, + "eval_logits/rejected": 0.52734375, + "eval_logps/chosen": -320.0, + "eval_logps/rejected": -1208.0, + "eval_loss": 0.583984375, + "eval_nll_loss": 0.57421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0625, + "eval_rewards/margins": 6.0, + "eval_rewards/rejected": 4.0625, + "eval_runtime": 1.0568, + "eval_samples_per_second": 3.785, + "eval_steps_per_second": 0.946, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 
0.5832429010086848, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": 0.10170898586511612, + "logits/rejected": 0.05078125, + "logps/chosen": -606.7999877929688, + "logps/rejected": -670.4000244140625, + "loss": 0.6896728515625, + "memory(GiB)": 36.36, + "nll_loss": 0.6890624761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.175000190734863, + "rewards/margins": 9.737500190734863, + "rewards/rejected": 0.4398437440395355, + "step": 45, + "train_speed(iter/s)": 0.41586 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.49952125769013667, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -0.04960937425494194, + "logits/rejected": -0.02602539025247097, + "logps/chosen": -610.4000244140625, + "logps/rejected": -663.5999755859375, + "loss": 0.64876708984375, + "memory(GiB)": 36.36, + "nll_loss": 0.647656261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.912500381469727, + "rewards/margins": 10.362500190734863, + "rewards/rejected": 0.546093761920929, + "step": 50, + "train_speed(iter/s)": 0.4198 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.31824407050814685, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": 0.22453613579273224, + "logits/rejected": 0.23637695610523224, + "logps/chosen": -608.4000244140625, + "logps/rejected": -621.5999755859375, + "loss": 0.68779296875, + "memory(GiB)": 36.36, + "nll_loss": 0.6875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.149999618530273, + "rewards/margins": 10.612500190734863, + "rewards/rejected": 0.55908203125, + "step": 55, + "train_speed(iter/s)": 0.423222 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.23207078350156118, + "learning_rate": 5e-05, + "logits/chosen": 0.3818359375, + "logits/rejected": -0.07106933742761612, + "logps/chosen": -654.0, + "logps/rejected": -466.79998779296875, + "loss": 0.577880859375, + "memory(GiB)": 36.36, + "nll_loss": 0.578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.862500190734863, + 
"rewards/margins": 11.5, + "rewards/rejected": 0.3648925721645355, + "step": 60, + "train_speed(iter/s)": 0.425778 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -0.6015625, + "eval_logits/rejected": 0.65234375, + "eval_logps/chosen": -304.0, + "eval_logps/rejected": -1192.0, + "eval_loss": 0.55859375, + "eval_nll_loss": 0.53515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.625, + "eval_rewards/margins": 6.3125, + "eval_rewards/rejected": 5.34375, + "eval_runtime": 1.0788, + "eval_samples_per_second": 3.708, + "eval_steps_per_second": 0.927, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 32938830757888.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/training_args.bin b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..adcfb0815e4cd34d58e6017e0fcc324580304fec --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd8e548d0ee82a07e59ed98ea8de6a14052438085f0628cc1e4751979417d39 +size 9016 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/zero_to_fp32.py b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- 
/dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/README.md b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5978e4fb1fd6c90157286b85839ad77358f92787 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct 
and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/adapter_config.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a24ef59a277a25429baa3b5582e16c9758a877af --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/adapter_model.safetensors b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bafb932c205fe9b8e7b72803ff03da095530475c --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8065bebd9648d8ca234bb709a2d1de5fe236dad1a59c146fb1b1405e5e04a64 +size 18516456 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/additional_config.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/args.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/args.json new file mode 100644 index 0000000000000000000000000000000000000000..99c4b7064894f541e8747d7da5ae79faee20027e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + 
"model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": 
"steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": 
null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-1.5b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-1.5b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc2f867e8db3a8882a461a9e87ba963be6610f2e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a8fbb07cd28afcaa3e9b167c04bf5d8d5691d43061f49c80ac1731d18941322 +size 13852592 diff --git 
a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4649a242508723327979b327e3112de882cf8e27 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f289234d7e4f0dad65f53690fa1a1c5f4c079aaa827189f0b514074c61476ab2 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c0a623aeae4839f6ad8898e9dbc458a69c91e3f --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:357c8ee118a725c2876d7f236dfdde84bccca7946c480a4f4e27a7e88e492594 +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7affa49b5e63e340b09644685161befff299d2ba --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7877a37b00c79db4aef8fc9bd0a363ecd3359d6948fe2ade3d1e97e6c48ecc +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ba78eb704f0bbe079e7ac30d48b15d60eb871fe --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e392b2938667088f287845466c80e8269295c99f958611916d991e78f2885df +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b96e1d0f01251957da63245a6916035d4cbcb61 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81d648c935c1e7952227929843aa524f1614589956ad8a49bf6c38d23330cf7e +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2617abf2a3a418e57ea93c745346d04dce258881 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d05d5907667a8b693b2d45eeed6d8187e8d87076d069cda1b7e39637a2c3f58f +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e7aa2537cc833eb0827800a959684115a1ec629 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59476e1c29c42d71c3bca8b58024718e8cb8bdc7017e9be0179afa249f4dcd6e +size 13852592 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b7047855296a0b527211a7d31aca153a2d1caab --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:434a8540c81b6f48abbadf4d8026c1eba121de5ecfa3a2012ef5a9ab22a4aa77 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8aed8798bac2279eb208a53d7f4d1c3f1caec0ec --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c950354ea7b3ebbecce5a163f3ba2ed4180cf0ee60c1e33535d12c5287ee238 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f06f0c58bf49cf716cdb4f9588c6fd7e9a8f24e1 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c412e4238801ff9647200a41201c11a67095c9eb798e6475b61d124991e8ae6 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..286a26d00d3219841a4ff3bd33f602e82ac05d7d --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25a66d2e5ae48ad1f9e4de2a31ea8a9bf9f4d4ec2bd537602297a2107ad5ed5c +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e72128f384516eaf8343dbfec6261edccf930372 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c27d6be3391eca7e8850eb5438840a973234ce1a697bba0cfa2dd73b7dbb2166 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..569a6c665cee58ba1032f9f612629232281bc380 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4ea4809042822d7b331dd2b21876ceb188dd0c1facc50577d82b50c6645b5ac +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f04e37e0832979f59b41182d75db32d951364b3d --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6af3692f3ce5d0c2d5358103b25848d0d109ad7e3292bc8355f86aaa7ce2b5b2 +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d02312eb6b4724c6a6e05ed2383c61bfc21e4a12 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dfba0931b421687e6566aa116cd057b4ad4c9723d6140d39f0321c3b1b6b9fd +size 388374 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/latest b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..75eab498d0366633484ab40334e4b8fb92b16dad --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/latest @@ -0,0 +1 @@ +global_step80 \ No newline at end of file diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_0.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_0.pth new file mode 100644 
index 0000000000000000000000000000000000000000..572d9bd86f4559e91e7b9a4fdc47494e5c6e9568 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d7e02ffb4d440dce7ab4ce0b5617578ec9ce3672acee7434ed6f1153f1ae0c +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_1.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d21df4c1d8717a3994f151fbc05460a0172725e --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b40ca759e432b2688f021b81291d74a40f56a205e9842119f7e772275eebd3 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_2.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6048bfa1e35e3b563aec9f5c1c6788496c3f068d --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdaef955ddd36d6bc1c40584113dd6205483e2aa85b02439b8b27e82e02a8359 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_3.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3722ed81a034ae380c794d8b45b0464c00099aa6 --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b14ae5db356e6512538751d6b386c190754e307cc99cd652d5c6dd891e1f82 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_4.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..13231ff967baa9c056d5a7ec0cc489a62679039c --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26e28be26826eeeed244b77185c67b443ac185175f8d4bf5ba94caa8b271bc5 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_5.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3401deecf687fd1382dae699b8d2e1a52949a4a --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847cedc1d6ca26f299a132c2ade9754887374acb9d98f26594a85d4c7742d474 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_6.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ab10b8ff32ba08d69bdf75cb904d226b3d9008 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bcd043d1690ae0ff6991b03322799a0b28f021427b15fd9f1e5ed8b9905d9307 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_7.pth b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c6fb5670c4f108f08c81f04f22272cdd57b7745 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772190f7e6667c865d25fc72da7bdd1b5d39f46fe03bb5c2d754aee1ad3c99c7 +size 15984 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/scheduler.pt b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..eaf96d6803aea265d756d902db3c4cc2386f9742 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90524bcdb94734ac7120e4205110f14662bff8cee00eed50355875dcdc538029 +size 1064 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/trainer_state.json b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7e2568d1328992692944cd72a51753edcc2b3ae4 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.55859375, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-60", + "epoch": 2.1052631578947367, + "eval_steps": 20, + 
"global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 12.157362496957303, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.6875, + "logits/rejected": 0.1416015625, + "logps/chosen": -768.0, + "logps/rejected": -284.0, + "loss": 1.384765625, + "memory(GiB)": 3.36, + "nll_loss": 0.69140625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.136299 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.539150660119484, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": 0.068756103515625, + "logits/rejected": -0.03271484375, + "logps/chosen": -809.5, + "logps/rejected": -460.5, + "loss": 2.017822265625, + "memory(GiB)": 15.74, + "nll_loss": 1.37890625, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.05777740478515625, + "rewards/margins": 0.1044921875, + "rewards/rejected": -0.04648590087890625, + "step": 5, + "train_speed(iter/s)": 0.30928 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.599031383319412, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.18076172471046448, + "logits/rejected": 0.168701171875, + "logps/chosen": -724.0, + "logps/rejected": -717.0, + "loss": 2.158203125, + "memory(GiB)": 36.36, + "nll_loss": 1.5546875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.964062511920929, + "rewards/margins": 0.3333984315395355, + "rewards/rejected": 0.6314452886581421, + "step": 10, + "train_speed(iter/s)": 0.364139 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.41127146668536, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": 0.3662109375, + "logits/rejected": 0.2630859315395355, + "logps/chosen": -729.5999755859375, + "logps/rejected": -533.5999755859375, + "loss": 1.2904296875, + "memory(GiB)": 36.36, + "nll_loss": 0.981249988079071, + 
"rewards/accuracies": 0.875, + "rewards/chosen": 2.487499952316284, + "rewards/margins": 1.62890625, + "rewards/rejected": 0.8617187738418579, + "step": 15, + "train_speed(iter/s)": 0.391058 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 15.35539910021312, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": 0.12734374403953552, + "logits/rejected": 0.15620116889476776, + "logps/chosen": -484.3999938964844, + "logps/rejected": -604.7999877929688, + "loss": 1.4791015625, + "memory(GiB)": 36.36, + "nll_loss": 1.1945312023162842, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.234375, + "rewards/margins": 1.860937476158142, + "rewards/rejected": 2.3671875, + "step": 20, + "train_speed(iter/s)": 0.404763 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.59765625, + "eval_logits/rejected": 0.58203125, + "eval_logps/chosen": -346.0, + "eval_logps/rejected": -1200.0, + "eval_loss": 0.8779296875, + "eval_nll_loss": 0.66015625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 7.46875, + "eval_rewards/margins": 2.515625, + "eval_rewards/rejected": 4.9375, + "eval_runtime": 1.0501, + "eval_samples_per_second": 3.809, + "eval_steps_per_second": 0.952, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 9.750705999680513, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": 0.0028198242653161287, + "logits/rejected": 0.02851562574505806, + "logps/chosen": -708.7999877929688, + "logps/rejected": -666.0, + "loss": 1.101220703125, + "memory(GiB)": 36.36, + "nll_loss": 0.8804687261581421, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 6.09375, + "rewards/margins": 3.2359375953674316, + "rewards/rejected": 2.8531250953674316, + "step": 25, + "train_speed(iter/s)": 0.402214 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 1.1620640103930189, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -0.14511719346046448, + "logits/rejected": -0.010668945498764515, + 
"logps/chosen": -493.20001220703125, + "logps/rejected": -559.5999755859375, + "loss": 0.742919921875, + "memory(GiB)": 36.36, + "nll_loss": 0.717968761920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.668749809265137, + "rewards/margins": 4.993750095367432, + "rewards/rejected": 2.6703124046325684, + "step": 30, + "train_speed(iter/s)": 0.409863 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.5082652670453853, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -0.0003417968691792339, + "logits/rejected": -0.09726562350988388, + "logps/chosen": -783.2000122070312, + "logps/rejected": -649.2000122070312, + "loss": 0.8309814453125, + "memory(GiB)": 36.36, + "nll_loss": 0.778124988079071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.25, + "rewards/margins": 5.650000095367432, + "rewards/rejected": 2.6031250953674316, + "step": 35, + "train_speed(iter/s)": 0.415557 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.1139559192769446, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": 0.05820312350988388, + "logits/rejected": -0.06640625, + "logps/chosen": -599.5999755859375, + "logps/rejected": -630.7999877929688, + "loss": 0.725, + "memory(GiB)": 36.36, + "nll_loss": 0.782031238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.856249809265137, + "rewards/margins": 7.199999809265137, + "rewards/rejected": 1.6640625, + "step": 40, + "train_speed(iter/s)": 0.419666 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -0.73828125, + "eval_logits/rejected": 0.52734375, + "eval_logps/chosen": -320.0, + "eval_logps/rejected": -1208.0, + "eval_loss": 0.583984375, + "eval_nll_loss": 0.57421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.0625, + "eval_rewards/margins": 6.0, + "eval_rewards/rejected": 4.0625, + "eval_runtime": 1.0568, + "eval_samples_per_second": 3.785, + "eval_steps_per_second": 0.946, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 
0.5832429010086848, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": 0.10170898586511612, + "logits/rejected": 0.05078125, + "logps/chosen": -606.7999877929688, + "logps/rejected": -670.4000244140625, + "loss": 0.6896728515625, + "memory(GiB)": 36.36, + "nll_loss": 0.6890624761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.175000190734863, + "rewards/margins": 9.737500190734863, + "rewards/rejected": 0.4398437440395355, + "step": 45, + "train_speed(iter/s)": 0.41586 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.49952125769013667, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -0.04960937425494194, + "logits/rejected": -0.02602539025247097, + "logps/chosen": -610.4000244140625, + "logps/rejected": -663.5999755859375, + "loss": 0.64876708984375, + "memory(GiB)": 36.36, + "nll_loss": 0.647656261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.912500381469727, + "rewards/margins": 10.362500190734863, + "rewards/rejected": 0.546093761920929, + "step": 50, + "train_speed(iter/s)": 0.4198 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.31824407050814685, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": 0.22453613579273224, + "logits/rejected": 0.23637695610523224, + "logps/chosen": -608.4000244140625, + "logps/rejected": -621.5999755859375, + "loss": 0.68779296875, + "memory(GiB)": 36.36, + "nll_loss": 0.6875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.149999618530273, + "rewards/margins": 10.612500190734863, + "rewards/rejected": 0.55908203125, + "step": 55, + "train_speed(iter/s)": 0.423222 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.23207078350156118, + "learning_rate": 5e-05, + "logits/chosen": 0.3818359375, + "logits/rejected": -0.07106933742761612, + "logps/chosen": -654.0, + "logps/rejected": -466.79998779296875, + "loss": 0.577880859375, + "memory(GiB)": 36.36, + "nll_loss": 0.578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.862500190734863, + 
"rewards/margins": 11.5, + "rewards/rejected": 0.3648925721645355, + "step": 60, + "train_speed(iter/s)": 0.425778 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -0.6015625, + "eval_logits/rejected": 0.65234375, + "eval_logps/chosen": -304.0, + "eval_logps/rejected": -1192.0, + "eval_loss": 0.55859375, + "eval_nll_loss": 0.53515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.625, + "eval_rewards/margins": 6.3125, + "eval_rewards/rejected": 5.34375, + "eval_runtime": 1.0788, + "eval_samples_per_second": 3.708, + "eval_steps_per_second": 0.927, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.6795800719500261, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -0.07011719048023224, + "logits/rejected": 0.01972656324505806, + "logps/chosen": -590.4000244140625, + "logps/rejected": -555.2000122070312, + "loss": 0.60504150390625, + "memory(GiB)": 36.36, + "nll_loss": 0.604296863079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.800000190734863, + "rewards/margins": 10.949999809265137, + "rewards/rejected": 0.851757824420929, + "step": 65, + "train_speed(iter/s)": 0.423328 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.580148749174503, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": 0.0538330078125, + "logits/rejected": 0.05624999850988388, + "logps/chosen": -501.79998779296875, + "logps/rejected": -551.4000244140625, + "loss": 0.6089111328125, + "memory(GiB)": 36.36, + "nll_loss": 0.6078125238418579, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.725000381469727, + "rewards/margins": 10.162500381469727, + "rewards/rejected": 0.565625011920929, + "step": 70, + "train_speed(iter/s)": 0.425442 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.6687994812508333, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": 0.16015625, + "logits/rejected": 0.23457030951976776, + "logps/chosen": -650.0, + "logps/rejected": -695.2000122070312, + "loss": 0.688037109375, 
+ "memory(GiB)": 36.36, + "nll_loss": 0.688281238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.574999809265137, + "rewards/margins": 12.662500381469727, + "rewards/rejected": -0.09501953423023224, + "step": 75, + "train_speed(iter/s)": 0.427366 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.779151521456302, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": 0.17226561903953552, + "logits/rejected": 0.12241210788488388, + "logps/chosen": -528.7999877929688, + "logps/rejected": -652.7999877929688, + "loss": 0.6132568359375, + "memory(GiB)": 36.36, + "nll_loss": 0.602343738079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.824999809265137, + "rewards/margins": 12.537500381469727, + "rewards/rejected": -0.6976562738418579, + "step": 80, + "train_speed(iter/s)": 0.428029 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -0.53125, + "eval_logits/rejected": 0.6875, + "eval_logps/chosen": -298.0, + "eval_logps/rejected": -1184.0, + "eval_loss": 0.56982421875, + "eval_nll_loss": 0.5234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 12.1875, + "eval_rewards/margins": 6.03125, + "eval_rewards/rejected": 6.15625, + "eval_runtime": 1.013, + "eval_samples_per_second": 3.949, + "eval_steps_per_second": 0.987, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 44070743474176.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/training_args.bin b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/training_args.bin new file mode 
100644 index 0000000000000000000000000000000000000000..adcfb0815e4cd34d58e6017e0fcc324580304fec --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd8e548d0ee82a07e59ed98ea8de6a14052438085f0628cc1e4751979417d39 +size 9016 diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/zero_to_fp32.py b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logits_chosen.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..ac94baaa78e5e3f9f239e40492b4460a3834856f Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logits_chosen.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logits_rejected.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..ecec949ad529f919b090b0b97b7dc6ee0aaeb4ae Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logits_rejected.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logps_chosen.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..32df0241c8e24eb911694d80d5b32024f695d7bf Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logps_chosen.png differ diff --git 
a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logps_rejected.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..16d1315dfc4cbcc73ce450eb87e5993c9c472389 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_logps_rejected.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_loss.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..0710496ccf13b21763836f9c6a10d903fc696bac Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_loss.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_nll_loss.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..b7b954b2f06f927a67e2f11ee0fc34a8b722dfe8 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_nll_loss.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_accuracies.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..b359853ac747638a7e44efa30e8bcf81f3befcfc Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_accuracies.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_chosen.png 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..41d5f4e826ad5ec1af0621fab05a9796cdd517ec Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_chosen.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_margins.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..ef5d291bee063e7de18480178ce7555b0a60ea58 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_margins.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_rejected.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..343cae938706b81256fb2e3265d9acc8b71f8a0c Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_rewards_rejected.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_runtime.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..39d55eb0ff2a02c7b68d35e13a1784e7adbac5fc Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_runtime.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_samples_per_second.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_samples_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..239b39569ef6a466d12236ea3db8d75dfe6e610c Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_samples_per_second.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_steps_per_second.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..c2623c1a7a7820db9baabbd3832b5953dbd29e81 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/eval_steps_per_second.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_epoch.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..c152c9bbb1100d680125e899d3d0da3db98dc221 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_epoch.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_grad_norm.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..339d672b9a585776c4540ef31f913baf0f7f9b81 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_grad_norm.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_learning_rate.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..e35e42c173f451f9666ba448872c615528441c9e Binary files /dev/null and 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_learning_rate.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logits_chosen.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..eacd64c94e2fc9b0fff41efab5cdd3f112811f44 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logits_chosen.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logits_rejected.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..260d22e4a0cab44b385910682104a8465bcf3433 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logits_rejected.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logps_chosen.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..690f66a0918e3c8432675d64a0b31354cdff94c9 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logps_chosen.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logps_rejected.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..c889bf45b12b64de0394f57c6eb9cce13e50daae Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_logps_rejected.png differ diff --git 
a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_loss.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..4d946bd1b5fe4d313981a176c08790e77288dde0 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_loss.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_memory(GiB).png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..42e156d098510b7cb58867eebe271b7932275137 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_memory(GiB).png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_nll_loss.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..71f21ce63981f5f333e3630c72719af2ef324860 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_nll_loss.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_accuracies.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..93c0d599f0927ddb6c32046e03a184b8c24664a3 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_accuracies.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_chosen.png 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..4300da6004341de8d0750e1c1fc8dba30646f9b6 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_chosen.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_margins.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..6b8c26028c9e9f5f3d6c6d2be61a87cb39d1dc49 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_margins.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_rejected.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..f63605f3c2bb1a32f5b4c7cac03f937c3eeda56b Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_rewards_rejected.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_total_flos.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..a40302bec64f19d6acfbbd6f66768543382545f0 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_total_flos.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_loss.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_loss.png new file mode 100644 
index 0000000000000000000000000000000000000000..a9ebeeaffda784f58cfbc10f0e36001affc593ce Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_loss.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_runtime.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..1d2aee10b721e90b1874ed311d03a81d20244033 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_runtime.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_samples_per_second.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..9979070b99b82bc0dec2cffbb35ab2fd99a7bfeb Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_samples_per_second.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_speed(iter_s).png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..c92c3d4359082181b876230c2882a91e42ee8c2a Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_speed(iter_s).png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_steps_per_second.png b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_steps_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..1b4384340a067f4e104df788ea1ee72100779ff9 Binary files /dev/null and b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/images/train_train_steps_per_second.png differ diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/logging.jsonl b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f6e3379ab3b364b8f83c9baaf394e6e5abcb6bd8 --- /dev/null +++ b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/logging.jsonl @@ -0,0 +1,31 @@ +{"loss": 1.38476562, "grad_norm": 12.1573625, "learning_rate": 1.667e-05, "memory(GiB)": 3.36, "train_speed(iter/s)": 0.136299, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/chosen": -768.0, "logps/rejected": -284.0, "logits/chosen": 0.6875, "logits/rejected": 0.14160156, "nll_loss": 0.69140625, "epoch": 0.02631579, "global_step/max_steps": "1/114", "percentage": "0.88%", "elapsed_time": "4s", "remaining_time": "8m 32s"} +{"loss": 2.01782227, "grad_norm": 13.53915066, "learning_rate": 8.333e-05, "memory(GiB)": 15.74, "train_speed(iter/s)": 0.30928, "rewards/chosen": 0.0577774, "rewards/rejected": -0.0464859, "rewards/accuracies": 0.4375, "rewards/margins": 0.10449219, "logps/chosen": -809.5, "logps/rejected": -460.5, "logits/chosen": 0.0687561, "logits/rejected": -0.03271484, "nll_loss": 1.37890625, "epoch": 0.13157895, "global_step/max_steps": "5/114", "percentage": "4.39%", "elapsed_time": "13s", "remaining_time": "4m 51s"} +{"loss": 2.15820312, "grad_norm": 10.59903138, "learning_rate": 9.966e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.364139, "rewards/chosen": 0.96406251, "rewards/rejected": 0.63144529, "rewards/accuracies": 0.69999999, "rewards/margins": 0.33339843, "logps/chosen": -724.0, "logps/rejected": -717.0, "logits/chosen": 0.18076172, 
"logits/rejected": 0.16870117, "nll_loss": 1.5546875, "epoch": 0.26315789, "global_step/max_steps": "10/114", "percentage": "8.77%", "elapsed_time": "24s", "remaining_time": "4m 16s"} +{"loss": 1.29042969, "grad_norm": 3.41127147, "learning_rate": 9.83e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.391058, "rewards/chosen": 2.48749995, "rewards/rejected": 0.86171877, "rewards/accuracies": 0.875, "rewards/margins": 1.62890625, "logps/chosen": -729.59997559, "logps/rejected": -533.59997559, "logits/chosen": 0.36621094, "logits/rejected": 0.26308593, "nll_loss": 0.98124999, "epoch": 0.39473684, "global_step/max_steps": "15/114", "percentage": "13.16%", "elapsed_time": "35s", "remaining_time": "3m 54s"} +{"loss": 1.47910156, "grad_norm": 15.3553991, "learning_rate": 9.591e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.404763, "rewards/chosen": 4.234375, "rewards/rejected": 2.3671875, "rewards/accuracies": 0.89999998, "rewards/margins": 1.86093748, "logps/chosen": -484.3999939, "logps/rejected": -604.79998779, "logits/chosen": 0.12734374, "logits/rejected": 0.15620117, "nll_loss": 1.1945312, "epoch": 0.52631579, "global_step/max_steps": "20/114", "percentage": "17.54%", "elapsed_time": "46s", "remaining_time": "3m 39s"} +{"eval_loss": 0.87792969, "eval_runtime": 1.0501, "eval_samples_per_second": 3.809, "eval_steps_per_second": 0.952, "eval_rewards/chosen": 7.46875, "eval_rewards/rejected": 4.9375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 2.515625, "eval_logps/chosen": -346.0, "eval_logps/rejected": -1200.0, "eval_logits/chosen": -0.59765625, "eval_logits/rejected": 0.58203125, "eval_nll_loss": 0.66015625, "epoch": 0.52631579, "global_step/max_steps": "20/114", "percentage": "17.54%", "elapsed_time": "47s", "remaining_time": "3m 44s"} +{"loss": 1.1012207, "grad_norm": 9.750706, "learning_rate": 9.256e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.402214, "rewards/chosen": 6.09375, "rewards/rejected": 2.8531251, "rewards/accuracies": 
0.92500001, "rewards/margins": 3.2359376, "logps/chosen": -708.79998779, "logps/rejected": -666.0, "logits/chosen": 0.00281982, "logits/rejected": 0.02851563, "nll_loss": 0.88046873, "epoch": 0.65789474, "global_step/max_steps": "25/114", "percentage": "21.93%", "elapsed_time": "59s", "remaining_time": "3m 31s"} +{"loss": 0.74291992, "grad_norm": 1.16206401, "learning_rate": 8.83e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.409863, "rewards/chosen": 7.66874981, "rewards/rejected": 2.6703124, "rewards/accuracies": 1.0, "rewards/margins": 4.9937501, "logps/chosen": -493.20001221, "logps/rejected": -559.59997559, "logits/chosen": -0.14511719, "logits/rejected": -0.01066895, "nll_loss": 0.71796876, "epoch": 0.78947368, "global_step/max_steps": "30/114", "percentage": "26.32%", "elapsed_time": "1m 10s", "remaining_time": "3m 17s"} +{"loss": 0.83098145, "grad_norm": 1.50826527, "learning_rate": 8.324e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.415557, "rewards/chosen": 8.25, "rewards/rejected": 2.6031251, "rewards/accuracies": 0.97500002, "rewards/margins": 5.6500001, "logps/chosen": -783.20001221, "logps/rejected": -649.20001221, "logits/chosen": -0.0003418, "logits/rejected": -0.09726562, "nll_loss": 0.77812499, "epoch": 0.92105263, "global_step/max_steps": "35/114", "percentage": "30.70%", "elapsed_time": "1m 21s", "remaining_time": "3m 3s"} +{"loss": 0.725, "grad_norm": 1.11395592, "learning_rate": 7.748e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.419666, "rewards/chosen": 8.85624981, "rewards/rejected": 1.6640625, "rewards/accuracies": 1.0, "rewards/margins": 7.19999981, "logps/chosen": -599.59997559, "logps/rejected": -630.79998779, "logits/chosen": 0.05820312, "logits/rejected": -0.06640625, "nll_loss": 0.78203124, "epoch": 1.05263158, "global_step/max_steps": "40/114", "percentage": "35.09%", "elapsed_time": "1m 32s", "remaining_time": "2m 51s"} +{"eval_loss": 0.58398438, "eval_runtime": 1.0568, "eval_samples_per_second": 3.785, 
"eval_steps_per_second": 0.946, "eval_rewards/chosen": 10.0625, "eval_rewards/rejected": 4.0625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.0, "eval_logps/chosen": -320.0, "eval_logps/rejected": -1208.0, "eval_logits/chosen": -0.73828125, "eval_logits/rejected": 0.52734375, "eval_nll_loss": 0.57421875, "epoch": 1.05263158, "global_step/max_steps": "40/114", "percentage": "35.09%", "elapsed_time": "1m 33s", "remaining_time": "2m 53s"} +{"loss": 0.68967285, "grad_norm": 0.5832429, "learning_rate": 7.113e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.41586, "rewards/chosen": 10.17500019, "rewards/rejected": 0.43984374, "rewards/accuracies": 1.0, "rewards/margins": 9.73750019, "logps/chosen": -606.79998779, "logps/rejected": -670.40002441, "logits/chosen": 0.10170899, "logits/rejected": 0.05078125, "nll_loss": 0.68906248, "epoch": 1.18421053, "global_step/max_steps": "45/114", "percentage": "39.47%", "elapsed_time": "1m 45s", "remaining_time": "2m 41s"} +{"loss": 0.64876709, "grad_norm": 0.49952126, "learning_rate": 6.434e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.4198, "rewards/chosen": 10.91250038, "rewards/rejected": 0.54609376, "rewards/accuracies": 1.0, "rewards/margins": 10.36250019, "logps/chosen": -610.40002441, "logps/rejected": -663.59997559, "logits/chosen": -0.04960937, "logits/rejected": -0.02602539, "nll_loss": 0.64765626, "epoch": 1.31578947, "global_step/max_steps": "50/114", "percentage": "43.86%", "elapsed_time": "1m 56s", "remaining_time": "2m 28s"} +{"loss": 0.68779297, "grad_norm": 0.31824407, "learning_rate": 5.725e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.423222, "rewards/chosen": 11.14999962, "rewards/rejected": 0.55908203, "rewards/accuracies": 1.0, "rewards/margins": 10.61250019, "logps/chosen": -608.40002441, "logps/rejected": -621.59997559, "logits/chosen": 0.22453614, "logits/rejected": 0.23637696, "nll_loss": 0.6875, "epoch": 1.44736842, "global_step/max_steps": "55/114", "percentage": "48.25%", 
"elapsed_time": "2m 7s", "remaining_time": "2m 16s"} +{"loss": 0.57788086, "grad_norm": 0.23207078, "learning_rate": 5e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.425778, "rewards/chosen": 11.86250019, "rewards/rejected": 0.36489257, "rewards/accuracies": 1.0, "rewards/margins": 11.5, "logps/chosen": -654.0, "logps/rejected": -466.79998779, "logits/chosen": 0.38183594, "logits/rejected": -0.07106934, "nll_loss": 0.578125, "epoch": 1.57894737, "global_step/max_steps": "60/114", "percentage": "52.63%", "elapsed_time": "2m 18s", "remaining_time": "2m 4s"} +{"eval_loss": 0.55859375, "eval_runtime": 1.0788, "eval_samples_per_second": 3.708, "eval_steps_per_second": 0.927, "eval_rewards/chosen": 11.625, "eval_rewards/rejected": 5.34375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.3125, "eval_logps/chosen": -304.0, "eval_logps/rejected": -1192.0, "eval_logits/chosen": -0.6015625, "eval_logits/rejected": 0.65234375, "eval_nll_loss": 0.53515625, "epoch": 1.57894737, "global_step/max_steps": "60/114", "percentage": "52.63%", "elapsed_time": "2m 19s", "remaining_time": "2m 5s"} +{"loss": 0.6050415, "grad_norm": 0.67958007, "learning_rate": 4.275e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.423328, "rewards/chosen": 11.80000019, "rewards/rejected": 0.85175782, "rewards/accuracies": 1.0, "rewards/margins": 10.94999981, "logps/chosen": -590.40002441, "logps/rejected": -555.20001221, "logits/chosen": -0.07011719, "logits/rejected": 0.01972656, "nll_loss": 0.60429686, "epoch": 1.71052632, "global_step/max_steps": "65/114", "percentage": "57.02%", "elapsed_time": "2m 30s", "remaining_time": "1m 53s"} +{"loss": 0.60891113, "grad_norm": 0.58014875, "learning_rate": 3.566e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.425442, "rewards/chosen": 10.72500038, "rewards/rejected": 0.56562501, "rewards/accuracies": 1.0, "rewards/margins": 10.16250038, "logps/chosen": -501.79998779, "logps/rejected": -551.40002441, "logits/chosen": 0.05383301, 
"logits/rejected": 0.05625, "nll_loss": 0.60781252, "epoch": 1.84210526, "global_step/max_steps": "70/114", "percentage": "61.40%", "elapsed_time": "2m 41s", "remaining_time": "1m 41s"} +{"loss": 0.68803711, "grad_norm": 0.66879948, "learning_rate": 2.887e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.427366, "rewards/chosen": 12.57499981, "rewards/rejected": -0.09501953, "rewards/accuracies": 1.0, "rewards/margins": 12.66250038, "logps/chosen": -650.0, "logps/rejected": -695.20001221, "logits/chosen": 0.16015625, "logits/rejected": 0.23457031, "nll_loss": 0.68828124, "epoch": 1.97368421, "global_step/max_steps": "75/114", "percentage": "65.79%", "elapsed_time": "2m 52s", "remaining_time": "1m 29s"} +{"loss": 0.61325684, "grad_norm": 0.77915152, "learning_rate": 2.252e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.428029, "rewards/chosen": 11.82499981, "rewards/rejected": -0.69765627, "rewards/accuracies": 1.0, "rewards/margins": 12.53750038, "logps/chosen": -528.79998779, "logps/rejected": -652.79998779, "logits/chosen": 0.17226562, "logits/rejected": 0.12241211, "nll_loss": 0.60234374, "epoch": 2.10526316, "global_step/max_steps": "80/114", "percentage": "70.18%", "elapsed_time": "3m 4s", "remaining_time": "1m 18s"} +{"eval_loss": 0.56982422, "eval_runtime": 1.013, "eval_samples_per_second": 3.949, "eval_steps_per_second": 0.987, "eval_rewards/chosen": 12.1875, "eval_rewards/rejected": 6.15625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.03125, "eval_logps/chosen": -298.0, "eval_logps/rejected": -1184.0, "eval_logits/chosen": -0.53125, "eval_logits/rejected": 0.6875, "eval_nll_loss": 0.5234375, "epoch": 2.10526316, "global_step/max_steps": "80/114", "percentage": "70.18%", "elapsed_time": "3m 5s", "remaining_time": "1m 18s"} +{"loss": 0.6223999, "grad_norm": 0.70986134, "learning_rate": 1.676e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.425977, "rewards/chosen": 12.86250019, "rewards/rejected": 1.65937495, "rewards/accuracies": 1.0, 
"rewards/margins": 11.19999981, "logps/chosen": -527.20001221, "logps/rejected": -765.59997559, "logits/chosen": 0.06606445, "logits/rejected": 0.20031738, "nll_loss": 0.62226564, "epoch": 2.23684211, "global_step/max_steps": "85/114", "percentage": "74.56%", "elapsed_time": "3m 16s", "remaining_time": "1m 7s"} +{"loss": 0.61772461, "grad_norm": 0.37164934, "learning_rate": 1.17e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.42749, "rewards/chosen": 12.51249981, "rewards/rejected": 0.37109375, "rewards/accuracies": 1.0, "rewards/margins": 12.14999962, "logps/chosen": -634.40002441, "logps/rejected": -591.59997559, "logits/chosen": 0.25820312, "logits/rejected": 0.01315918, "nll_loss": 0.61679685, "epoch": 2.36842105, "global_step/max_steps": "90/114", "percentage": "78.95%", "elapsed_time": "3m 27s", "remaining_time": "55s"} +{"loss": 0.70126953, "grad_norm": 0.26676014, "learning_rate": 7.44e-06, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.428148, "rewards/chosen": 13.4375, "rewards/rejected": 1.15156245, "rewards/accuracies": 1.0, "rewards/margins": 12.28750038, "logps/chosen": -699.20001221, "logps/rejected": -696.79998779, "logits/chosen": 0.28183594, "logits/rejected": 0.36914062, "nll_loss": 0.70156252, "epoch": 2.5, "global_step/max_steps": "95/114", "percentage": "83.33%", "elapsed_time": "3m 39s", "remaining_time": "43s"} +{"loss": 0.60710449, "grad_norm": 0.55163328, "learning_rate": 4.09e-06, "memory(GiB)": 45.6, "train_speed(iter/s)": 0.428781, "rewards/chosen": 12.63749981, "rewards/rejected": 0.43593749, "rewards/accuracies": 1.0, "rewards/margins": 12.21249962, "logps/chosen": -518.79998779, "logps/rejected": -540.79998779, "logits/chosen": 0.06599121, "logits/rejected": 0.19042969, "nll_loss": 0.60703123, "epoch": 2.63157895, "global_step/max_steps": "100/114", "percentage": "87.72%", "elapsed_time": "3m 50s", "remaining_time": "32s"} +{"eval_loss": 0.54272461, "eval_runtime": 1.0634, "eval_samples_per_second": 3.762, 
"eval_steps_per_second": 0.94, "eval_rewards/chosen": 12.375, "eval_rewards/rejected": 6.15625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.1875, "eval_logps/chosen": -298.0, "eval_logps/rejected": -1184.0, "eval_logits/chosen": -0.515625, "eval_logits/rejected": 0.69140625, "eval_nll_loss": 0.51953125, "epoch": 2.63157895, "global_step/max_steps": "100/114", "percentage": "87.72%", "elapsed_time": "3m 51s", "remaining_time": "32s"} +{"loss": 0.60441895, "grad_norm": 0.90852037, "learning_rate": 1.7e-06, "memory(GiB)": 45.6, "train_speed(iter/s)": 0.426933, "rewards/chosen": 12.1875, "rewards/rejected": 0.84335935, "rewards/accuracies": 1.0, "rewards/margins": 11.32499981, "logps/chosen": -547.09997559, "logps/rejected": -508.0, "logits/chosen": 0.14899902, "logits/rejected": 0.14143066, "nll_loss": 0.60351562, "epoch": 2.76315789, "global_step/max_steps": "105/114", "percentage": "92.11%", "elapsed_time": "4m 3s", "remaining_time": "20s"} +{"loss": 0.53855286, "grad_norm": 0.52334158, "learning_rate": 3.4e-07, "memory(GiB)": 45.6, "train_speed(iter/s)": 0.428375, "rewards/chosen": 12.39999962, "rewards/rejected": 0.85195315, "rewards/accuracies": 1.0, "rewards/margins": 11.53750038, "logps/chosen": -565.59997559, "logps/rejected": -526.20001221, "logits/chosen": 0.07299805, "logits/rejected": 0.19794922, "nll_loss": 0.53828126, "epoch": 2.89473684, "global_step/max_steps": "110/114", "percentage": "96.49%", "elapsed_time": "4m 13s", "remaining_time": "9s"} +{"eval_loss": 0.54199219, "eval_runtime": 1.0337, "eval_samples_per_second": 3.87, "eval_steps_per_second": 0.967, "eval_rewards/chosen": 12.375, "eval_rewards/rejected": 6.15625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.25, "eval_logps/chosen": -296.0, "eval_logps/rejected": -1184.0, "eval_logits/chosen": -0.515625, "eval_logits/rejected": 0.6953125, "eval_nll_loss": 0.51953125, "epoch": 3.0, "global_step/max_steps": "114/114", "percentage": "100.00%", "elapsed_time": "4m 24s", 
"remaining_time": "0s"} +{"train_runtime": 264.9961, "train_samples_per_second": 3.374, "train_steps_per_second": 0.43, "total_flos": 62364809592832.0, "train_loss": 0.86599745, "epoch": 3.0, "global_step/max_steps": "114/114", "percentage": "100.00%", "elapsed_time": "4m 24s", "remaining_time": "0s"} +{"train_dataset": "1695.382550±899.293489, min=182.000000, max=4081.000000, size=298", "val_dataset": "1637.250000±797.581461, min=755.000000, max=2485.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 1786.3204M Params (9.2324M Trainable [0.5168%]), 0.0001M Buffers.", "last_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114", "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/checkpoint-114", "best_metric": 0.54199219, "global_step": 114, "log_history": [{"loss": 1.384765625, "grad_norm": 12.157362496957303, "learning_rate": 1.6666666666666667e-05, "memory(GiB)": 3.36, "train_speed(iter/s)": 0.136299, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/chosen": -768.0, "logps/rejected": -284.0, "logits/chosen": 0.6875, "logits/rejected": 0.1416015625, "nll_loss": 0.69140625, "epoch": 0.02631578947368421, "step": 1}, {"loss": 2.017822265625, "grad_norm": 13.539150660119484, "learning_rate": 8.333333333333334e-05, "memory(GiB)": 15.74, "train_speed(iter/s)": 0.30928, "rewards/chosen": 0.05777740478515625, "rewards/rejected": -0.04648590087890625, "rewards/accuracies": 0.4375, "rewards/margins": 0.1044921875, "logps/chosen": -809.5, "logps/rejected": -460.5, "logits/chosen": 0.068756103515625, "logits/rejected": -0.03271484375, "nll_loss": 1.37890625, "epoch": 0.13157894736842105, "step": 5}, {"loss": 2.158203125, "grad_norm": 10.599031383319412, "learning_rate": 9.966191788709716e-05, "memory(GiB)": 36.36, 
"train_speed(iter/s)": 0.364139, "rewards/chosen": 0.964062511920929, "rewards/rejected": 0.6314452886581421, "rewards/accuracies": 0.699999988079071, "rewards/margins": 0.3333984315395355, "logps/chosen": -724.0, "logps/rejected": -717.0, "logits/chosen": 0.18076172471046448, "logits/rejected": 0.168701171875, "nll_loss": 1.5546875, "epoch": 0.2631578947368421, "step": 10}, {"loss": 1.2904296875, "grad_norm": 3.41127146668536, "learning_rate": 9.829629131445342e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.391058, "rewards/chosen": 2.487499952316284, "rewards/rejected": 0.8617187738418579, "rewards/accuracies": 0.875, "rewards/margins": 1.62890625, "logps/chosen": -729.5999755859375, "logps/rejected": -533.5999755859375, "logits/chosen": 0.3662109375, "logits/rejected": 0.2630859315395355, "nll_loss": 0.981249988079071, "epoch": 0.39473684210526316, "step": 15}, {"loss": 1.4791015625, "grad_norm": 15.35539910021312, "learning_rate": 9.591080534401371e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.404763, "rewards/chosen": 4.234375, "rewards/rejected": 2.3671875, "rewards/accuracies": 0.8999999761581421, "rewards/margins": 1.860937476158142, "logps/chosen": -484.3999938964844, "logps/rejected": -604.7999877929688, "logits/chosen": 0.12734374403953552, "logits/rejected": 0.15620116889476776, "nll_loss": 1.1945312023162842, "epoch": 0.5263157894736842, "step": 20}, {"eval_loss": 0.8779296875, "eval_runtime": 1.0501, "eval_samples_per_second": 3.809, "eval_steps_per_second": 0.952, "eval_rewards/chosen": 7.46875, "eval_rewards/rejected": 4.9375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 2.515625, "eval_logps/chosen": -346.0, "eval_logps/rejected": -1200.0, "eval_logits/chosen": -0.59765625, "eval_logits/rejected": 0.58203125, "eval_nll_loss": 0.66015625, "epoch": 0.5263157894736842, "step": 20}, {"loss": 1.101220703125, "grad_norm": 9.750705999680513, "learning_rate": 9.255583362184999e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 
0.402214, "rewards/chosen": 6.09375, "rewards/rejected": 2.8531250953674316, "rewards/accuracies": 0.925000011920929, "rewards/margins": 3.2359375953674316, "logps/chosen": -708.7999877929688, "logps/rejected": -666.0, "logits/chosen": 0.0028198242653161287, "logits/rejected": 0.02851562574505806, "nll_loss": 0.8804687261581421, "epoch": 0.6578947368421053, "step": 25}, {"loss": 0.742919921875, "grad_norm": 1.1620640103930189, "learning_rate": 8.83022221559489e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.409863, "rewards/chosen": 7.668749809265137, "rewards/rejected": 2.6703124046325684, "rewards/accuracies": 1.0, "rewards/margins": 4.993750095367432, "logps/chosen": -493.20001220703125, "logps/rejected": -559.5999755859375, "logits/chosen": -0.14511719346046448, "logits/rejected": -0.010668945498764515, "nll_loss": 0.717968761920929, "epoch": 0.7894736842105263, "step": 30}, {"loss": 0.8309814453125, "grad_norm": 1.5082652670453853, "learning_rate": 8.323979328069689e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.415557, "rewards/chosen": 8.25, "rewards/rejected": 2.6031250953674316, "rewards/accuracies": 0.9750000238418579, "rewards/margins": 5.650000095367432, "logps/chosen": -783.2000122070312, "logps/rejected": -649.2000122070312, "logits/chosen": -0.0003417968691792339, "logits/rejected": -0.09726562350988388, "nll_loss": 0.778124988079071, "epoch": 0.9210526315789473, "step": 35}, {"loss": 0.725, "grad_norm": 1.1139559192769446, "learning_rate": 7.74754489035403e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.419666, "rewards/chosen": 8.856249809265137, "rewards/rejected": 1.6640625, "rewards/accuracies": 1.0, "rewards/margins": 7.199999809265137, "logps/chosen": -599.5999755859375, "logps/rejected": -630.7999877929688, "logits/chosen": 0.05820312350988388, "logits/rejected": -0.06640625, "nll_loss": 0.782031238079071, "epoch": 1.0526315789473684, "step": 40}, {"eval_loss": 0.583984375, "eval_runtime": 1.0568, "eval_samples_per_second": 
3.785, "eval_steps_per_second": 0.946, "eval_rewards/chosen": 10.0625, "eval_rewards/rejected": 4.0625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.0, "eval_logps/chosen": -320.0, "eval_logps/rejected": -1208.0, "eval_logits/chosen": -0.73828125, "eval_logits/rejected": 0.52734375, "eval_nll_loss": 0.57421875, "epoch": 1.0526315789473684, "step": 40}, {"loss": 0.6896728515625, "grad_norm": 0.5832429010086848, "learning_rate": 7.113091308703498e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.41586, "rewards/chosen": 10.175000190734863, "rewards/rejected": 0.4398437440395355, "rewards/accuracies": 1.0, "rewards/margins": 9.737500190734863, "logps/chosen": -606.7999877929688, "logps/rejected": -670.4000244140625, "logits/chosen": 0.10170898586511612, "logits/rejected": 0.05078125, "nll_loss": 0.6890624761581421, "epoch": 1.1842105263157894, "step": 45}, {"loss": 0.64876708984375, "grad_norm": 0.49952125769013667, "learning_rate": 6.434016163555452e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.4198, "rewards/chosen": 10.912500381469727, "rewards/rejected": 0.546093761920929, "rewards/accuracies": 1.0, "rewards/margins": 10.362500190734863, "logps/chosen": -610.4000244140625, "logps/rejected": -663.5999755859375, "logits/chosen": -0.04960937425494194, "logits/rejected": -0.02602539025247097, "nll_loss": 0.647656261920929, "epoch": 1.3157894736842106, "step": 50}, {"loss": 0.68779296875, "grad_norm": 0.31824407050814685, "learning_rate": 5.724659296536233e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.423222, "rewards/chosen": 11.149999618530273, "rewards/rejected": 0.55908203125, "rewards/accuracies": 1.0, "rewards/margins": 10.612500190734863, "logps/chosen": -608.4000244140625, "logps/rejected": -621.5999755859375, "logits/chosen": 0.22453613579273224, "logits/rejected": 0.23637695610523224, "nll_loss": 0.6875, "epoch": 1.4473684210526316, "step": 55}, {"loss": 0.577880859375, "grad_norm": 0.23207078350156118, "learning_rate": 5e-05, 
"memory(GiB)": 36.36, "train_speed(iter/s)": 0.425778, "rewards/chosen": 11.862500190734863, "rewards/rejected": 0.3648925721645355, "rewards/accuracies": 1.0, "rewards/margins": 11.5, "logps/chosen": -654.0, "logps/rejected": -466.79998779296875, "logits/chosen": 0.3818359375, "logits/rejected": -0.07106933742761612, "nll_loss": 0.578125, "epoch": 1.5789473684210527, "step": 60}, {"eval_loss": 0.55859375, "eval_runtime": 1.0788, "eval_samples_per_second": 3.708, "eval_steps_per_second": 0.927, "eval_rewards/chosen": 11.625, "eval_rewards/rejected": 5.34375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.3125, "eval_logps/chosen": -304.0, "eval_logps/rejected": -1192.0, "eval_logits/chosen": -0.6015625, "eval_logits/rejected": 0.65234375, "eval_nll_loss": 0.53515625, "epoch": 1.5789473684210527, "step": 60}, {"loss": 0.60504150390625, "grad_norm": 0.6795800719500261, "learning_rate": 4.275340703463767e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.423328, "rewards/chosen": 11.800000190734863, "rewards/rejected": 0.851757824420929, "rewards/accuracies": 1.0, "rewards/margins": 10.949999809265137, "logps/chosen": -590.4000244140625, "logps/rejected": -555.2000122070312, "logits/chosen": -0.07011719048023224, "logits/rejected": 0.01972656324505806, "nll_loss": 0.604296863079071, "epoch": 1.7105263157894737, "step": 65}, {"loss": 0.6089111328125, "grad_norm": 0.580148749174503, "learning_rate": 3.5659838364445505e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.425442, "rewards/chosen": 10.725000381469727, "rewards/rejected": 0.565625011920929, "rewards/accuracies": 1.0, "rewards/margins": 10.162500381469727, "logps/chosen": -501.79998779296875, "logps/rejected": -551.4000244140625, "logits/chosen": 0.0538330078125, "logits/rejected": 0.05624999850988388, "nll_loss": 0.6078125238418579, "epoch": 1.8421052631578947, "step": 70}, {"loss": 0.688037109375, "grad_norm": 0.6687994812508333, "learning_rate": 2.886908691296504e-05, "memory(GiB)": 36.36, 
"train_speed(iter/s)": 0.427366, "rewards/chosen": 12.574999809265137, "rewards/rejected": -0.09501953423023224, "rewards/accuracies": 1.0, "rewards/margins": 12.662500381469727, "logps/chosen": -650.0, "logps/rejected": -695.2000122070312, "logits/chosen": 0.16015625, "logits/rejected": 0.23457030951976776, "nll_loss": 0.688281238079071, "epoch": 1.973684210526316, "step": 75}, {"loss": 0.6132568359375, "grad_norm": 0.779151521456302, "learning_rate": 2.25245510964597e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.428029, "rewards/chosen": 11.824999809265137, "rewards/rejected": -0.6976562738418579, "rewards/accuracies": 1.0, "rewards/margins": 12.537500381469727, "logps/chosen": -528.7999877929688, "logps/rejected": -652.7999877929688, "logits/chosen": 0.17226561903953552, "logits/rejected": 0.12241210788488388, "nll_loss": 0.602343738079071, "epoch": 2.1052631578947367, "step": 80}, {"eval_loss": 0.56982421875, "eval_runtime": 1.013, "eval_samples_per_second": 3.949, "eval_steps_per_second": 0.987, "eval_rewards/chosen": 12.1875, "eval_rewards/rejected": 6.15625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.03125, "eval_logps/chosen": -298.0, "eval_logps/rejected": -1184.0, "eval_logits/chosen": -0.53125, "eval_logits/rejected": 0.6875, "eval_nll_loss": 0.5234375, "epoch": 2.1052631578947367, "step": 80}, {"loss": 0.62239990234375, "grad_norm": 0.7098613362463773, "learning_rate": 1.6760206719303105e-05, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.425977, "rewards/chosen": 12.862500190734863, "rewards/rejected": 1.6593749523162842, "rewards/accuracies": 1.0, "rewards/margins": 11.199999809265137, "logps/chosen": -527.2000122070312, "logps/rejected": -765.5999755859375, "logits/chosen": 0.06606445461511612, "logits/rejected": 0.2003173828125, "nll_loss": 0.622265636920929, "epoch": 2.236842105263158, "step": 85}, {"loss": 0.617724609375, "grad_norm": 0.37164934301407404, "learning_rate": 1.1697777844051105e-05, "memory(GiB)": 36.36, 
"train_speed(iter/s)": 0.42749, "rewards/chosen": 12.512499809265137, "rewards/rejected": 0.37109375, "rewards/accuracies": 1.0, "rewards/margins": 12.149999618530273, "logps/chosen": -634.4000244140625, "logps/rejected": -591.5999755859375, "logits/chosen": 0.2582031190395355, "logits/rejected": 0.01315918006002903, "nll_loss": 0.6167968511581421, "epoch": 2.3684210526315788, "step": 90}, {"loss": 0.70126953125, "grad_norm": 0.2667601385654993, "learning_rate": 7.444166378150013e-06, "memory(GiB)": 36.36, "train_speed(iter/s)": 0.428148, "rewards/chosen": 13.4375, "rewards/rejected": 1.1515624523162842, "rewards/accuracies": 1.0, "rewards/margins": 12.287500381469727, "logps/chosen": -699.2000122070312, "logps/rejected": -696.7999877929688, "logits/chosen": 0.2818359434604645, "logits/rejected": 0.369140625, "nll_loss": 0.7015625238418579, "epoch": 2.5, "step": 95}, {"loss": 0.6071044921875, "grad_norm": 0.5516332791700669, "learning_rate": 4.089194655986306e-06, "memory(GiB)": 45.6, "train_speed(iter/s)": 0.428781, "rewards/chosen": 12.637499809265137, "rewards/rejected": 0.4359374940395355, "rewards/accuracies": 1.0, "rewards/margins": 12.212499618530273, "logps/chosen": -518.7999877929688, "logps/rejected": -540.7999877929688, "logits/chosen": 0.06599120795726776, "logits/rejected": 0.1904296875, "nll_loss": 0.6070312261581421, "epoch": 2.6315789473684212, "step": 100}, {"eval_loss": 0.542724609375, "eval_runtime": 1.0634, "eval_samples_per_second": 3.762, "eval_steps_per_second": 0.94, "eval_rewards/chosen": 12.375, "eval_rewards/rejected": 6.15625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.1875, "eval_logps/chosen": -298.0, "eval_logps/rejected": -1184.0, "eval_logits/chosen": -0.515625, "eval_logits/rejected": 0.69140625, "eval_nll_loss": 0.51953125, "epoch": 2.6315789473684212, "step": 100}, {"loss": 0.6044189453125, "grad_norm": 0.9085203696227837, "learning_rate": 1.70370868554659e-06, "memory(GiB)": 45.6, "train_speed(iter/s)": 0.426933, 
"rewards/chosen": 12.1875, "rewards/rejected": 0.8433593511581421, "rewards/accuracies": 1.0, "rewards/margins": 11.324999809265137, "logps/chosen": -547.0999755859375, "logps/rejected": -508.0, "logits/chosen": 0.14899902045726776, "logits/rejected": 0.14143066108226776, "nll_loss": 0.603515625, "epoch": 2.763157894736842, "step": 105}, {"loss": 0.5385528564453125, "grad_norm": 0.5233415847185563, "learning_rate": 3.380821129028489e-07, "memory(GiB)": 45.6, "train_speed(iter/s)": 0.428375, "rewards/chosen": 12.399999618530273, "rewards/rejected": 0.8519531488418579, "rewards/accuracies": 1.0, "rewards/margins": 11.537500381469727, "logps/chosen": -565.5999755859375, "logps/rejected": -526.2000122070312, "logits/chosen": 0.072998046875, "logits/rejected": 0.19794921576976776, "nll_loss": 0.538281261920929, "epoch": 2.8947368421052633, "step": 110}, {"eval_loss": 0.5419921875, "eval_runtime": 1.0337, "eval_samples_per_second": 3.87, "eval_steps_per_second": 0.967, "eval_rewards/chosen": 12.375, "eval_rewards/rejected": 6.15625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 6.25, "eval_logps/chosen": -296.0, "eval_logps/rejected": -1184.0, "eval_logits/chosen": -0.515625, "eval_logits/rejected": 0.6953125, "eval_nll_loss": 0.51953125, "epoch": 3.0, "step": 114}, {"train_runtime": 264.9961, "train_samples_per_second": 3.374, "train_steps_per_second": 0.43, "total_flos": 62364809592832.0, "train_loss": 0.8659974483021519, "epoch": 3.0, "step": 114}], "memory": 45.6015625} diff --git a/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs/events.out.tfevents.1739308590.kml-task-540432-record-10109969-prod-worker-0.32067.0 b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs/events.out.tfevents.1739308590.kml-task-540432-record-10109969-prod-worker-0.32067.0 new file mode 100644 index 0000000000000000000000000000000000000000..8abdbefb91842c41d3ae1009c461b8f3cc9f0fdc --- /dev/null +++ 
b/deepseek-r1-1.5b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-211506/runs/events.out.tfevents.1739308590.kml-task-540432-record-10109969-prod-worker-0.32067.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95c04065c4fe0a46caceb861ded86c7919c2d6cfa9d76f4b5801693f56c228e +size 32212 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/args.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/args.json new file mode 100644 index 0000000000000000000000000000000000000000..0756d2a52ad9348f0bd930f099dd745d3ae6f547 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + 
"temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": 
false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + 
"length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": 
false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + 
"cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', 
model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], 
dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, 
use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/README.md b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c5c0404813bb3d884c2d0750e24391042738c029 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/adapter_config.json 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..37f229c21d96b79be7d935d8753f6044f91e1d5b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "o_proj", + "q_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/adapter_model.safetensors b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29815203a4fb371fd6594761c485983cd81c517f --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec3a7dd6cbad656cf9f87d375ccfa5cfbc83f26e0f658b84ae65952042e9c1f5 +size 68902296 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/additional_config.json 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/args.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/args.json new file mode 100644 index 0000000000000000000000000000000000000000..0756d2a52ad9348f0bd930f099dd745d3ae6f547 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + 
"hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + 
"save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + 
"train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + 
"target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + 
"reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, 
bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, 
length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, 
train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89b757535b273f6e246204a36420ed0faf9ee546 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcbb97f267b315e90bae422bd4b9656c2860c9abc8a82f7a0e6c5a9aef130da9 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7f884d8e0bbeee2537f887969f4ce76b488cb4d --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c221e5352d260fc5a7c1630a5831273193684f50bcc2f2fbb589f9b962bab15f +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..5993f2b91f66800484b56df7ca923bd4b093f029 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:457afec1d391a2eef8153bc1e206df3a26df418b811a3f0c272078b4b36c9aff +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7676c5ef737392c9eca326cc7e6b153cbcc10714 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b043fa353db037b362bc671799e1d33edbf8e7a4646a23321129a02f07a040f2 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da1294ae816049943c1c7042c7ab01f6ac1aecc1 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a47e39f741c1b52fc87c1c388f6317cd2e353d97f15192732fee7e03ab82064 +size 51613616 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2cdb36f47bd024061f9e2fbc0354d42ab6ba94a3 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82956324136e9616c6230a242a9500adf7e659e005de4157be63fafbc1f86596 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..704271adf3db24e075d282c98dded0cb746b0369 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f714ec0a3f5df771332a09e9d8262efa25f9b83c937f082b8c37d9a08e894ad +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35fe26034fe796d36a7c3490222dfbe272fc2003 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eabf156dd4ad1578dc54bfccf42e5021e4bd26a454ac4010a50e5679da64e1c8 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca9e149d406f3ba060df00662843e208114186d6 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:135a7241ef86deb44c06cb46106b144c6e0e0c6012a18d83a3b9900ee9887115 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70a1db716db77b20ce69e020211bd624d917fdc9 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:192dbf9a4ce2f4bf6aa21f7b28309443ecc22114d4c26ab7c31833ee4fd8ed6f +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..633a89f6bcf368edb6854d00497f4df874d8754c --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2cf6cb84dcdbdfc470e75fcec2424f47825bfc445fc2f5c189a110fd89794a1 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f3a20ba8148315a06d9b6627ed5ee3d3db5255c --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241cf79cc46b8720c3383c4d5e07258f71513067605f3af4e98ada709037fe6c +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb97335edaaf12e7d04b57d168811b35742d6e80 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f68ea5493995f4bd480618ef95080063c299cb1bf9889e54e67f368e50295cf0 
+size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..382f020457c5f2831932377914bded675ff5039e --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97553c6ea6760e161bf7482142842224a0033997c14b61e006ff2f7ae10c024a +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..161855440189181ba47e10abb51733b1fc4e3093 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8946d6a0f45a11de886c6a8d31a7b51eabf81902df7507f1c93812c6b4b6f01 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26de83c3f3bce77c08aa0fdc646a85355a613bb0 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e371c9fd2aa88159e817c8c5175b22abb35a52efc1c96e22a5a080c415017f82 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/latest b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..744ae7dbad571b6f37ec6c7066549494261bb59e --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/latest @@ -0,0 +1 @@ +global_step100 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_0.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a74f25da28f01a2e6b66587824ee5f5cc9be737 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ee195ebde9bf012f945f068f133e7fe22fef5450c496607e3ef11cc2034a186 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_1.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f44ddc47315653477728c971b4ea191a3df8b92c --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0fe1a3315d60b197207c5cb249d0ce4f9ce6d7585e696276d9ffbcb5379893 +size 15984 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_2.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..04636b9eca6484a4339eaa1e3acdf15d42d493b3 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c5bd6eae04542162b3e94245555bd81312524066bc01d0ebbfc4fd8554240e +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_3.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..05435e407541728c3159054a4beb6705039a8ddf --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b74942c68b00d657cfce186b0eeb4aa8f52efa04b114803b605fee8de45972 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_4.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..94fdf5f2c3e5df27424e6482bf52255531147a23 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd66dd2ba958fc9929441817d8154abbd929c0aa9cd66ff3171965bdaaf5d78 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_5.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_5.pth 
new file mode 100644 index 0000000000000000000000000000000000000000..da6e37fc011d97a1512e1e746bdd410a738c018a --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89eeedefdd62514d0130acc330a5c08e9774c95d38c60997905cfd65fc54b710 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_6.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..751fd85c617e15dee9713bc0f0c533af5bd18c8e --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ced939100082608f57561a10e1888e69210c80675068db530c5815889910e +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_7.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4aacf54fa8285b7e199a7cd62f1ee3d8b9beb5e5 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d8d6ee244d99525e7004ae3f02d44ae63082d81fbbab7306f641ac6aeeb736f +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/scheduler.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2a1fb08c48e9d34df783eb19e7c9d1caf0ed386 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec37c3a15b8d061312402391f2fddb52d623a1416d6d2879a30f184450d844f +size 1064 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/trainer_state.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1b2b8d5aff56087dd5ac8a7ef10f57d128f1aa --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.30786133, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80", + "epoch": 2.6315789473684212, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.742470529381032, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.5703125, + "logps/chosen": -704.0, + "logps/rejected": -266.0, + "loss": 1.3349609375, + "memory(GiB)": 9.88, + "nll_loss": 0.64453125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.087917 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.977731082873543, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.611328125, + "logits/rejected": -1.634765625, + "logps/chosen": -719.5, + "logps/rejected": -400.5, + "loss": 2.087646484375, + "memory(GiB)": 19.62, + "nll_loss": 1.4091796875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.05169677734375, + "rewards/margins": 0.02392578125, + 
"rewards/rejected": 0.027721405029296875, + "step": 5, + "train_speed(iter/s)": 0.140484 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 7.808526234047937, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.529687523841858, + "logits/rejected": -1.6765625476837158, + "logps/chosen": -612.7999877929688, + "logps/rejected": -635.7999877929688, + "loss": 1.9408203125, + "memory(GiB)": 46.24, + "nll_loss": 1.470312476158142, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8820312023162842, + "rewards/margins": 0.841015636920929, + "rewards/rejected": 1.0390625, + "step": 10, + "train_speed(iter/s)": 0.141462 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.1525120590855167, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7843749523162842, + "logps/chosen": -625.2000122070312, + "logps/rejected": -439.3999938964844, + "loss": 1.054736328125, + "memory(GiB)": 46.24, + "nll_loss": 0.815625011920929, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 5.650000095367432, + "rewards/margins": 3.207812547683716, + "rewards/rejected": 2.4453125, + "step": 15, + "train_speed(iter/s)": 0.151619 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.821418612981352, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.7531249523162842, + "logps/chosen": -399.3999938964844, + "logps/rejected": -522.0, + "loss": 0.8697021484375, + "memory(GiB)": 46.24, + "nll_loss": 0.8140624761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.362500190734863, + "rewards/margins": 4.324999809265137, + "rewards/rejected": 4.043749809265137, + "step": 20, + "train_speed(iter/s)": 0.152154 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.890625, + "eval_logits/rejected": -1.78125, + "eval_logps/chosen": -180.0, + "eval_logps/rejected": -1056.0, + "eval_loss": 0.40576171875, + "eval_nll_loss": 0.3828125, + 
"eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.125, + "eval_rewards/margins": 5.5, + "eval_rewards/rejected": 5.625, + "eval_runtime": 2.3009, + "eval_samples_per_second": 1.738, + "eval_steps_per_second": 0.435, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 2.7496208811504723, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -1.5984375476837158, + "logits/rejected": -1.626562476158142, + "logps/chosen": -567.5999755859375, + "logps/rejected": -600.2000122070312, + "loss": 0.64052734375, + "memory(GiB)": 46.24, + "nll_loss": 0.610156238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.112500190734863, + "rewards/margins": 6.525000095367432, + "rewards/rejected": 3.5999999046325684, + "step": 25, + "train_speed(iter/s)": 0.148916 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8408714330433036, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -1.423437476158142, + "logits/rejected": -1.8078124523162842, + "logps/chosen": -411.5, + "logps/rejected": -513.4000244140625, + "loss": 0.5287109375, + "memory(GiB)": 46.24, + "nll_loss": 0.528124988079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.162500381469727, + "rewards/margins": 9.125, + "rewards/rejected": 2.0218749046325684, + "step": 30, + "train_speed(iter/s)": 0.148793 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.7617614950729771, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -1.610937476158142, + "logits/rejected": -1.6828124523162842, + "logps/chosen": -632.4000244140625, + "logps/rejected": -559.2000122070312, + "loss": 0.579290771484375, + "memory(GiB)": 46.24, + "nll_loss": 0.578906238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.475000381469727, + "rewards/margins": 10.274999618530273, + "rewards/rejected": 2.206249952316284, + "step": 35, + "train_speed(iter/s)": 0.149226 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.7627215464350648, + "learning_rate": 
7.74754489035403e-05, + "logits/chosen": -1.5515625476837158, + "logits/rejected": -1.6218750476837158, + "logps/chosen": -474.0, + "logps/rejected": -580.4000244140625, + "loss": 0.5338623046875, + "memory(GiB)": 46.24, + "nll_loss": 0.590624988079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.649999618530273, + "rewards/margins": 11.875, + "rewards/rejected": 1.77734375, + "step": 40, + "train_speed(iter/s)": 0.150086 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -0.7734375, + "eval_logits/rejected": -1.734375, + "eval_logps/chosen": -160.0, + "eval_logps/rejected": -1104.0, + "eval_loss": 0.330322265625, + "eval_nll_loss": 0.330078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.0625, + "eval_rewards/margins": 12.125, + "eval_rewards/rejected": 0.8984375, + "eval_runtime": 2.32, + "eval_samples_per_second": 1.724, + "eval_steps_per_second": 0.431, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.4870463862811011, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -1.4578125476837158, + "logits/rejected": -1.5656249523162842, + "logps/chosen": -445.0, + "logps/rejected": -619.5999755859375, + "loss": 0.44976806640625, + "memory(GiB)": 47.69, + "nll_loss": 0.44999998807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 14.837499618530273, + "rewards/rejected": 0.16914062201976776, + "step": 45, + "train_speed(iter/s)": 0.148124 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.5046561338439631, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -1.4015624523162842, + "logits/rejected": -1.6484375, + "logps/chosen": -494.3999938964844, + "logps/rejected": -602.7999877929688, + "loss": 0.497467041015625, + "memory(GiB)": 47.69, + "nll_loss": 0.49726563692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.850000381469727, + "rewards/margins": 15.300000190734863, + "rewards/rejected": 0.533007800579071, + "step": 50, + 
"train_speed(iter/s)": 0.149026 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.24557805720844658, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -1.3078124523162842, + "logits/rejected": -1.5078125, + "logps/chosen": -472.79998779296875, + "logps/rejected": -544.0, + "loss": 0.478399658203125, + "memory(GiB)": 47.69, + "nll_loss": 0.4781250059604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.412500381469727, + "rewards/margins": 15.337499618530273, + "rewards/rejected": 1.0632812976837158, + "step": 55, + "train_speed(iter/s)": 0.151404 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.17812446885778863, + "learning_rate": 5e-05, + "logits/chosen": -1.446874976158142, + "logits/rejected": -1.345312476158142, + "logps/chosen": -517.2000122070312, + "logps/rejected": -388.79998779296875, + "loss": 0.437841796875, + "memory(GiB)": 47.69, + "nll_loss": 0.4378906190395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.787500381469727, + "rewards/margins": 15.712499618530273, + "rewards/rejected": 1.034765601158142, + "step": 60, + "train_speed(iter/s)": 0.152443 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -0.55078125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -152.0, + "eval_logps/rejected": -1088.0, + "eval_loss": 0.322998046875, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.875, + "eval_rewards/margins": 12.0625, + "eval_rewards/rejected": 1.8515625, + "eval_runtime": 2.3313, + "eval_samples_per_second": 1.716, + "eval_steps_per_second": 0.429, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.5905179750476125, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -1.3390624523162842, + "logits/rejected": -1.3562500476837158, + "logps/chosen": -453.0, + "logps/rejected": -488.79998779296875, + "loss": 0.42652587890625, + "memory(GiB)": 47.69, + "nll_loss": 0.42656248807907104, + "rewards/accuracies": 1.0, + 
"rewards/chosen": 16.825000762939453, + "rewards/margins": 16.549999237060547, + "rewards/rejected": 0.27421873807907104, + "step": 65, + "train_speed(iter/s)": 0.152062 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.5364455470833442, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": -1.192968726158142, + "logits/rejected": -1.48828125, + "logps/chosen": -405.0, + "logps/rejected": -501.3999938964844, + "loss": 0.455877685546875, + "memory(GiB)": 47.69, + "nll_loss": 0.45429688692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.674999237060547, + "rewards/margins": 16.6875, + "rewards/rejected": -0.03144531324505806, + "step": 70, + "train_speed(iter/s)": 0.152032 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.5248526481142106, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": -1.2296874523162842, + "logits/rejected": -1.334375023841858, + "logps/chosen": -513.2000122070312, + "logps/rejected": -614.4000244140625, + "loss": 0.4895751953125, + "memory(GiB)": 47.69, + "nll_loss": 0.4898437559604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.975000381469727, + "rewards/margins": 17.950000762939453, + "rewards/rejected": -0.9507812261581421, + "step": 75, + "train_speed(iter/s)": 0.152668 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.5755623127495862, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": -1.2664062976837158, + "logits/rejected": -1.2742187976837158, + "logps/chosen": -435.6000061035156, + "logps/rejected": -602.0, + "loss": 0.46800537109375, + "memory(GiB)": 47.69, + "nll_loss": 0.47265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.549999237060547, + "rewards/margins": 19.612499237060547, + "rewards/rejected": -3.076171875, + "step": 80, + "train_speed(iter/s)": 0.151515 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -0.423828125, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -150.0, + "eval_logps/rejected": -1120.0, + "eval_loss": 
0.307861328125, + "eval_nll_loss": 0.30859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.0625, + "eval_rewards/margins": 15.1875, + "eval_rewards/rejected": -1.140625, + "eval_runtime": 2.3353, + "eval_samples_per_second": 1.713, + "eval_steps_per_second": 0.428, + "step": 80 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.433199990811398, + "learning_rate": 1.6760206719303105e-05, + "logits/chosen": -1.0460937023162842, + "logits/rejected": -1.431249976158142, + "logps/chosen": -394.3999938964844, + "logps/rejected": -710.7999877929688, + "loss": 0.4093597412109375, + "memory(GiB)": 47.69, + "nll_loss": 0.40937501192092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.837499618530273, + "rewards/margins": 17.462499618530273, + "rewards/rejected": -0.6041015386581421, + "step": 85, + "train_speed(iter/s)": 0.151429 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.2844696685876377, + "learning_rate": 1.1697777844051105e-05, + "logits/chosen": -1.282812476158142, + "logits/rejected": -1.19921875, + "logps/chosen": -486.0, + "logps/rejected": -521.2000122070312, + "loss": 0.432275390625, + "memory(GiB)": 47.69, + "nll_loss": 0.43242186307907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.975000381469727, + "rewards/margins": 18.75, + "rewards/rejected": -0.7632812261581421, + "step": 90, + "train_speed(iter/s)": 0.152662 + }, + { + "epoch": 2.5, + "grad_norm": 0.26464365821513314, + "learning_rate": 7.444166378150013e-06, + "logits/chosen": -1.1804687976837158, + "logits/rejected": -1.420312523841858, + "logps/chosen": -528.4000244140625, + "logps/rejected": -635.5999755859375, + "loss": 0.476739501953125, + "memory(GiB)": 47.69, + "nll_loss": 0.4769531190395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.924999237060547, + "rewards/margins": 19.575000762939453, + "rewards/rejected": -0.6499999761581421, + "step": 95, + "train_speed(iter/s)": 0.152454 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 
0.35279617616415065, + "learning_rate": 4.089194655986306e-06, + "logits/chosen": -1.181249976158142, + "logits/rejected": -1.482812523841858, + "logps/chosen": -404.3999938964844, + "logps/rejected": -512.4000244140625, + "loss": 0.403515625, + "memory(GiB)": 56.94, + "nll_loss": 0.40351563692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.762500762939453, + "rewards/margins": 19.825000762939453, + "rewards/rejected": -3.0625, + "step": 100, + "train_speed(iter/s)": 0.153114 + }, + { + "epoch": 2.6315789473684212, + "eval_logits/chosen": -0.392578125, + "eval_logits/rejected": -1.5625, + "eval_logps/chosen": -150.0, + "eval_logps/rejected": -1120.0, + "eval_loss": 0.307861328125, + "eval_nll_loss": 0.30859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.125, + "eval_rewards/margins": 15.375, + "eval_rewards/rejected": -1.2421875, + "eval_runtime": 2.33, + "eval_samples_per_second": 1.717, + "eval_steps_per_second": 0.429, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 315676542894080.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/training_args.bin b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0f9c1513f988aabfc34ba541849784465bac6329 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e811855c17405f7bd5d7032b14eb256861d319a7a7facc3093caef660f4dc162 +size 9016 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/zero_to_fp32.py b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/README.md b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c5c0404813bb3d884c2d0750e24391042738c029 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/adapter_config.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..37f229c21d96b79be7d935d8753f6044f91e1d5b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "o_proj", + "q_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/adapter_model.safetensors b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..22d6606a897f2f465a882eaa26a40da760b17ee8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bb01a9c32831f6bb03d7572fe90a927f6ce26b5eb813a4089766613ede6152c +size 68902296 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/additional_config.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/args.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/args.json new file mode 100644 index 0000000000000000000000000000000000000000..0756d2a52ad9348f0bd930f099dd745d3ae6f547 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + 
"model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": 
"steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": 
null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0712f78ba10562f04b66431f0d696c094c72e1b2 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c5e8cdb1a71459f2054b36944d23b1ec02830cc30a3c8b650606fcc50820a43 +size 51613616 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f808e45e8019772757134b6447a6e7dbba7c180 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc6230949cc4d4cfa42c4bee552b16711f77cd3204c6dd4c146f94d5c47a7e25 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3342dff91e663d4e58254d1d80bcac80962b4bff --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc323a16eb1a9d9314f9a7b472aa3de44eb1719bc60388aedfe274aab29b7ec5 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83570c12677770ec393e919f65d544d78b3aac35 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2401ddfc63c0e989e70a23430bc379759e0cd7614d79a16bf57e40187f5600f +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1a90b5bcc9a69d09a263c412c50f0d73a6b8525 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbfd0b83a0130992d8f8c2c97ce92b582041e63dfd4bbdb15b4fb0dfb55ef65a +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09ea0ee6d21582febaec498b77ef8b881a28d387 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68b1fd051380c961e0a3ca866b54b65ceef5bd62ea08d208634187712186a645 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61608b54eb7b53f2f58074464cbd53f2d758a73a --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26c6a5d57df078d7b323ecab1ad5f8f7bfa570bf939361b0deefd8617155cfdb +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e705f56804dcd7f648396a1125cccc9b2dcc646c --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be43129f795b18c5f9fb2235714f46766ce01835c9934da9cfdf3d04e1feb235 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8459674d5176bf1de6e3e48ebdb88d1c08100ceb --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de1c58d4f94c2dd9e1c44f5bdcc3b6161fa2d62e7556033df0903635c4e1b80f +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43eec646f564dc9e76d5e8f3171c89a45c056565 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcb2a4efc3e314ebb2f3cfcf6b8de4f301096f2cba69514fec3a1954e4286794 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f9873b05462fb31c872408c1fbc5fa06b1a5106 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8478ec52fe15cafbacba6a99ea433ac446dc52c8f7a424ad900dad132b6b710b +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8611ec147ba2721aab0a20b09c7a2b311ceaa65a --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a75c2cac370dd83bf6233c436aeffaac04de47b8a8a1f4f4c6e77b940ffe8b07 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfe9507b95e850e9c88a9948976528b6003497ee --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fb0a4cb7bd6c2886133af02a6a8d0a4e62892916c1de4f3b2ff45873e8e612d +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00fa8dd30af72b20b7d1f1266d652e4ab8b6d5e5 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:003881962508336e23d4fc3b713678d96db0284cf85af9600165a7e3bd006fd2 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2edd70e809603a7907dceeed982d3c475046fa6 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ec29ddcf52e54e9ec33eb915e80f653168bf5c89efb4d8b1422ab6b66635df8 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..262e0ecfae1c1f9e8c14e118a094bd550a8b5d44 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c29a1f7426bafff0a760e962066b2fa7f441e039a0e792ab9f6f3c7c9d55eb6 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/latest b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/latest new file mode 100644 index 0000000000000000000000000000000000000000..aad80f76777fd4d23b0b81026f4601524335cbe1 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/latest @@ -0,0 +1 @@ +global_step114 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_0.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_0.pth new file mode 
100644 index 0000000000000000000000000000000000000000..ee83ae5e323e0bb676daf05f7f41b7951b49c7af --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9162e03c562553a5d9d13120f544d3c47ea71bb39aa44e18253675e17ed4a4 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_1.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cd0edf74beb406ae74d27fac689e74cc1a7d12b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4809456871b3a40c8db7e0926a9db11b01149a1d483fb29b16fc69dabaf36c6f +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_2.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..378e4e23e02084387cef58f5bfa08ef5b23ef1b3 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb6bcf25ff148b74eea7dd4895fc42e9433538fff5d75f0d2ae6cb0c2fdadf0 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_3.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9d23b00a6e62ab23a83b688e4077471f0501ba0 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f00ea04cd1a52c539d9cc948ac8a04676d6b99702acd09149565f781806f63f +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_4.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf6105fec105f5636599de6b5ea414adc300ed30 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5571fb2fc1b413792b01ac691c759786855573992bab1d14875faccdaf8c881e +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_5.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..983c7580e17a958602e3218e885e88e85d4ed9a0 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59019ba23ead9c15851cb4349397254458ce50ea3c2987090404f4f3842c6d8f +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_6.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f87fedb0a1eac5d251eeb1e7cf58190877f6b60 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:45fdffda57fda4a555da7a5de6fc6ec7324e0dae048b92519af6c4f6a1bc7412 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_7.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d32d0d7a4ca68837a8e91f7101758f2f48116bde --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fb2c13e63aba83c4505fae1639f79a33853d8f1bebe20cecb73bf53c8e7c46 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/scheduler.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a86ac614a477eb67963adb2c8c07f37c79ded059 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7a9fd18bda7faa50931342147a7de5605bed0f91f6c70d821e84b7bf8f444f +size 1064 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/trainer_state.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c6836e17342e8b386cab961cd56cafab04d65e88 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/trainer_state.json @@ -0,0 +1,549 @@ +{ + "best_metric": 0.30786133, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80", + "epoch": 3.0, + "eval_steps": 20, + "global_step": 114, + 
"is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.742470529381032, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.5703125, + "logps/chosen": -704.0, + "logps/rejected": -266.0, + "loss": 1.3349609375, + "memory(GiB)": 9.88, + "nll_loss": 0.64453125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.087917 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.977731082873543, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.611328125, + "logits/rejected": -1.634765625, + "logps/chosen": -719.5, + "logps/rejected": -400.5, + "loss": 2.087646484375, + "memory(GiB)": 19.62, + "nll_loss": 1.4091796875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.05169677734375, + "rewards/margins": 0.02392578125, + "rewards/rejected": 0.027721405029296875, + "step": 5, + "train_speed(iter/s)": 0.140484 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 7.808526234047937, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.529687523841858, + "logits/rejected": -1.6765625476837158, + "logps/chosen": -612.7999877929688, + "logps/rejected": -635.7999877929688, + "loss": 1.9408203125, + "memory(GiB)": 46.24, + "nll_loss": 1.470312476158142, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8820312023162842, + "rewards/margins": 0.841015636920929, + "rewards/rejected": 1.0390625, + "step": 10, + "train_speed(iter/s)": 0.141462 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.1525120590855167, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7843749523162842, + "logps/chosen": -625.2000122070312, + "logps/rejected": -439.3999938964844, + "loss": 1.054736328125, + "memory(GiB)": 46.24, + "nll_loss": 0.815625011920929, + 
"rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 5.650000095367432, + "rewards/margins": 3.207812547683716, + "rewards/rejected": 2.4453125, + "step": 15, + "train_speed(iter/s)": 0.151619 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.821418612981352, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.7531249523162842, + "logps/chosen": -399.3999938964844, + "logps/rejected": -522.0, + "loss": 0.8697021484375, + "memory(GiB)": 46.24, + "nll_loss": 0.8140624761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.362500190734863, + "rewards/margins": 4.324999809265137, + "rewards/rejected": 4.043749809265137, + "step": 20, + "train_speed(iter/s)": 0.152154 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.890625, + "eval_logits/rejected": -1.78125, + "eval_logps/chosen": -180.0, + "eval_logps/rejected": -1056.0, + "eval_loss": 0.40576171875, + "eval_nll_loss": 0.3828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.125, + "eval_rewards/margins": 5.5, + "eval_rewards/rejected": 5.625, + "eval_runtime": 2.3009, + "eval_samples_per_second": 1.738, + "eval_steps_per_second": 0.435, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 2.7496208811504723, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -1.5984375476837158, + "logits/rejected": -1.626562476158142, + "logps/chosen": -567.5999755859375, + "logps/rejected": -600.2000122070312, + "loss": 0.64052734375, + "memory(GiB)": 46.24, + "nll_loss": 0.610156238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.112500190734863, + "rewards/margins": 6.525000095367432, + "rewards/rejected": 3.5999999046325684, + "step": 25, + "train_speed(iter/s)": 0.148916 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8408714330433036, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -1.423437476158142, + "logits/rejected": -1.8078124523162842, + "logps/chosen": 
-411.5, + "logps/rejected": -513.4000244140625, + "loss": 0.5287109375, + "memory(GiB)": 46.24, + "nll_loss": 0.528124988079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.162500381469727, + "rewards/margins": 9.125, + "rewards/rejected": 2.0218749046325684, + "step": 30, + "train_speed(iter/s)": 0.148793 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.7617614950729771, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -1.610937476158142, + "logits/rejected": -1.6828124523162842, + "logps/chosen": -632.4000244140625, + "logps/rejected": -559.2000122070312, + "loss": 0.579290771484375, + "memory(GiB)": 46.24, + "nll_loss": 0.578906238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.475000381469727, + "rewards/margins": 10.274999618530273, + "rewards/rejected": 2.206249952316284, + "step": 35, + "train_speed(iter/s)": 0.149226 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.7627215464350648, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -1.5515625476837158, + "logits/rejected": -1.6218750476837158, + "logps/chosen": -474.0, + "logps/rejected": -580.4000244140625, + "loss": 0.5338623046875, + "memory(GiB)": 46.24, + "nll_loss": 0.590624988079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.649999618530273, + "rewards/margins": 11.875, + "rewards/rejected": 1.77734375, + "step": 40, + "train_speed(iter/s)": 0.150086 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -0.7734375, + "eval_logits/rejected": -1.734375, + "eval_logps/chosen": -160.0, + "eval_logps/rejected": -1104.0, + "eval_loss": 0.330322265625, + "eval_nll_loss": 0.330078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.0625, + "eval_rewards/margins": 12.125, + "eval_rewards/rejected": 0.8984375, + "eval_runtime": 2.32, + "eval_samples_per_second": 1.724, + "eval_steps_per_second": 0.431, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.4870463862811011, + "learning_rate": 
7.113091308703498e-05, + "logits/chosen": -1.4578125476837158, + "logits/rejected": -1.5656249523162842, + "logps/chosen": -445.0, + "logps/rejected": -619.5999755859375, + "loss": 0.44976806640625, + "memory(GiB)": 47.69, + "nll_loss": 0.44999998807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 14.837499618530273, + "rewards/rejected": 0.16914062201976776, + "step": 45, + "train_speed(iter/s)": 0.148124 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.5046561338439631, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -1.4015624523162842, + "logits/rejected": -1.6484375, + "logps/chosen": -494.3999938964844, + "logps/rejected": -602.7999877929688, + "loss": 0.497467041015625, + "memory(GiB)": 47.69, + "nll_loss": 0.49726563692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.850000381469727, + "rewards/margins": 15.300000190734863, + "rewards/rejected": 0.533007800579071, + "step": 50, + "train_speed(iter/s)": 0.149026 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.24557805720844658, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -1.3078124523162842, + "logits/rejected": -1.5078125, + "logps/chosen": -472.79998779296875, + "logps/rejected": -544.0, + "loss": 0.478399658203125, + "memory(GiB)": 47.69, + "nll_loss": 0.4781250059604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.412500381469727, + "rewards/margins": 15.337499618530273, + "rewards/rejected": 1.0632812976837158, + "step": 55, + "train_speed(iter/s)": 0.151404 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.17812446885778863, + "learning_rate": 5e-05, + "logits/chosen": -1.446874976158142, + "logits/rejected": -1.345312476158142, + "logps/chosen": -517.2000122070312, + "logps/rejected": -388.79998779296875, + "loss": 0.437841796875, + "memory(GiB)": 47.69, + "nll_loss": 0.4378906190395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.787500381469727, + "rewards/margins": 15.712499618530273, 
+ "rewards/rejected": 1.034765601158142, + "step": 60, + "train_speed(iter/s)": 0.152443 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -0.55078125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -152.0, + "eval_logps/rejected": -1088.0, + "eval_loss": 0.322998046875, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.875, + "eval_rewards/margins": 12.0625, + "eval_rewards/rejected": 1.8515625, + "eval_runtime": 2.3313, + "eval_samples_per_second": 1.716, + "eval_steps_per_second": 0.429, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.5905179750476125, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -1.3390624523162842, + "logits/rejected": -1.3562500476837158, + "logps/chosen": -453.0, + "logps/rejected": -488.79998779296875, + "loss": 0.42652587890625, + "memory(GiB)": 47.69, + "nll_loss": 0.42656248807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.825000762939453, + "rewards/margins": 16.549999237060547, + "rewards/rejected": 0.27421873807907104, + "step": 65, + "train_speed(iter/s)": 0.152062 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.5364455470833442, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": -1.192968726158142, + "logits/rejected": -1.48828125, + "logps/chosen": -405.0, + "logps/rejected": -501.3999938964844, + "loss": 0.455877685546875, + "memory(GiB)": 47.69, + "nll_loss": 0.45429688692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.674999237060547, + "rewards/margins": 16.6875, + "rewards/rejected": -0.03144531324505806, + "step": 70, + "train_speed(iter/s)": 0.152032 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.5248526481142106, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": -1.2296874523162842, + "logits/rejected": -1.334375023841858, + "logps/chosen": -513.2000122070312, + "logps/rejected": -614.4000244140625, + "loss": 0.4895751953125, + "memory(GiB)": 47.69, + 
"nll_loss": 0.4898437559604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.975000381469727, + "rewards/margins": 17.950000762939453, + "rewards/rejected": -0.9507812261581421, + "step": 75, + "train_speed(iter/s)": 0.152668 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.5755623127495862, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": -1.2664062976837158, + "logits/rejected": -1.2742187976837158, + "logps/chosen": -435.6000061035156, + "logps/rejected": -602.0, + "loss": 0.46800537109375, + "memory(GiB)": 47.69, + "nll_loss": 0.47265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.549999237060547, + "rewards/margins": 19.612499237060547, + "rewards/rejected": -3.076171875, + "step": 80, + "train_speed(iter/s)": 0.151515 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -0.423828125, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -150.0, + "eval_logps/rejected": -1120.0, + "eval_loss": 0.307861328125, + "eval_nll_loss": 0.30859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.0625, + "eval_rewards/margins": 15.1875, + "eval_rewards/rejected": -1.140625, + "eval_runtime": 2.3353, + "eval_samples_per_second": 1.713, + "eval_steps_per_second": 0.428, + "step": 80 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.433199990811398, + "learning_rate": 1.6760206719303105e-05, + "logits/chosen": -1.0460937023162842, + "logits/rejected": -1.431249976158142, + "logps/chosen": -394.3999938964844, + "logps/rejected": -710.7999877929688, + "loss": 0.4093597412109375, + "memory(GiB)": 47.69, + "nll_loss": 0.40937501192092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.837499618530273, + "rewards/margins": 17.462499618530273, + "rewards/rejected": -0.6041015386581421, + "step": 85, + "train_speed(iter/s)": 0.151429 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.2844696685876377, + "learning_rate": 1.1697777844051105e-05, + "logits/chosen": -1.282812476158142, + 
"logits/rejected": -1.19921875, + "logps/chosen": -486.0, + "logps/rejected": -521.2000122070312, + "loss": 0.432275390625, + "memory(GiB)": 47.69, + "nll_loss": 0.43242186307907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.975000381469727, + "rewards/margins": 18.75, + "rewards/rejected": -0.7632812261581421, + "step": 90, + "train_speed(iter/s)": 0.152662 + }, + { + "epoch": 2.5, + "grad_norm": 0.26464365821513314, + "learning_rate": 7.444166378150013e-06, + "logits/chosen": -1.1804687976837158, + "logits/rejected": -1.420312523841858, + "logps/chosen": -528.4000244140625, + "logps/rejected": -635.5999755859375, + "loss": 0.476739501953125, + "memory(GiB)": 47.69, + "nll_loss": 0.4769531190395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 18.924999237060547, + "rewards/margins": 19.575000762939453, + "rewards/rejected": -0.6499999761581421, + "step": 95, + "train_speed(iter/s)": 0.152454 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.35279617616415065, + "learning_rate": 4.089194655986306e-06, + "logits/chosen": -1.181249976158142, + "logits/rejected": -1.482812523841858, + "logps/chosen": -404.3999938964844, + "logps/rejected": -512.4000244140625, + "loss": 0.403515625, + "memory(GiB)": 56.94, + "nll_loss": 0.40351563692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.762500762939453, + "rewards/margins": 19.825000762939453, + "rewards/rejected": -3.0625, + "step": 100, + "train_speed(iter/s)": 0.153114 + }, + { + "epoch": 2.6315789473684212, + "eval_logits/chosen": -0.392578125, + "eval_logits/rejected": -1.5625, + "eval_logps/chosen": -150.0, + "eval_logps/rejected": -1120.0, + "eval_loss": 0.307861328125, + "eval_nll_loss": 0.30859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.125, + "eval_rewards/margins": 15.375, + "eval_rewards/rejected": -1.2421875, + "eval_runtime": 2.33, + "eval_samples_per_second": 1.717, + "eval_steps_per_second": 0.429, + "step": 100 + }, + { + "epoch": 2.763157894736842, + 
"grad_norm": 0.7317102695797911, + "learning_rate": 1.70370868554659e-06, + "logits/chosen": -1.2179687023162842, + "logits/rejected": -1.306249976158142, + "logps/chosen": -434.3999938964844, + "logps/rejected": -481.3999938964844, + "loss": 0.43280029296875, + "memory(GiB)": 56.94, + "nll_loss": 0.4332031309604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.837499618530273, + "rewards/margins": 18.625, + "rewards/rejected": -1.791406273841858, + "step": 105, + "train_speed(iter/s)": 0.152621 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.4831650405036417, + "learning_rate": 3.380821129028489e-07, + "logits/chosen": -1.1375000476837158, + "logits/rejected": -1.4609375, + "logps/chosen": -453.6000061035156, + "logps/rejected": -509.3999938964844, + "loss": 0.40185546875, + "memory(GiB)": 56.94, + "nll_loss": 0.4017578065395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.6875, + "rewards/margins": 19.875, + "rewards/rejected": -2.1617188453674316, + "step": 110, + "train_speed(iter/s)": 0.152896 + }, + { + "epoch": 3.0, + "eval_logits/chosen": -0.388671875, + "eval_logits/rejected": -1.5625, + "eval_logps/chosen": -150.0, + "eval_logps/rejected": -1120.0, + "eval_loss": 0.30810546875, + "eval_nll_loss": 0.30859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.125, + "eval_rewards/margins": 15.375, + "eval_rewards/rejected": -1.2421875, + "eval_runtime": 2.2618, + "eval_samples_per_second": 1.768, + "eval_steps_per_second": 0.442, + "step": 114 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 359042439708672.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/training_args.bin b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0f9c1513f988aabfc34ba541849784465bac6329 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e811855c17405f7bd5d7032b14eb256861d319a7a7facc3093caef660f4dc162 +size 9016 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/zero_to_fp32.py b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint.
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/README.md b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c5c0404813bb3d884c2d0750e24391042738c029 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/adapter_config.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..37f229c21d96b79be7d935d8753f6044f91e1d5b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": 
{}, + "revision": null, + "target_modules": [ + "down_proj", + "o_proj", + "q_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/adapter_model.safetensors b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..13bd6f121bb2871fedb0b6892264fec51947312a --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39f598aea03e7e7f0382127b8f2e7d1c15f10684bfc676776a3fa066f709e303 +size 68902296 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/additional_config.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/args.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..0756d2a52ad9348f0bd930f099dd745d3ae6f547 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", 
+ "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, 
+ "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + 
"fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..055442fdfc7c3226b0535811b8ea60d37455cc0b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ab33f6dae797e5b238134246864595b12f6cb7e1d1a598c3457ffa424f702be +size 51613616 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..250b2261674a4a3286d3f0ce0a198ec14652af89 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15f4c34c004aff93031ed7e70708e20a6e16437dc3b6f88849e484cb72032a30 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b43fc9706b6d7ef064197bcd29f553d978fee58 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d959920ece93ee73d9ba5aeeeef1bde6417d4cebf0315c39b1d09eb6f7d3f364 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca999e617a8eb0243dddfc465092b53bfe610cb5 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:625e5e0fe943d8630754dfc601edcba9431e45c4b0230b38eca4ead9e49568b6 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b6d0e37b9929c6f88152ddd037718f5ae73d84b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2a490b7aff78bf30e9ca2173f4307cbdfbb763e85df70fa23f1affaedcc88b7 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98f0b9e05bdba9cc29aea9dc86d23c578881e2e3 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d2401d5992eaecd594376b887569d1d50326fa00dfda231eccbefe0beb20e89 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..835c743588bdee3e06f1234ce5272fd48079af25 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9270bb6c23d86c6a8012cd77997974fa7d2245a21aa1d074f4e1c10ddae0c948 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1189472b7d7cbee950640c37ba8b29852a7ecbb3 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241a73f46f5b7eebdbccf2a3634a7edad31419761ca8e702564b18c76af30769 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5130d746711120654ca2426fd9325519fb4041ff --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bfb8b752e1efea254a7dd063c7a380782d9d088d90f52f0b98beecb99f9a3a33 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c92428d46189606904531906f3537885acff80ce --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad9ef095f87e69e15ff6d0c28382a2c5e26d6220d8beea93988b13174b27895 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edfd39277dd98038bc12a651b0d2d95198525f5a --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00b1356ade0dc96992e1800d507a66ea2e3cfdece9086d926b62b1b7c90d8f39 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3420a685deeae87b531fdacbabb87f53347a79f --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00134d988d18b32077303d4b1ea64330990480dd08ffe43f9d1cceff55168bbe +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..647de2f787636aa03f50cae70f5752468c5ef846 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6b9016e5acece1a252ec78e9131ff06d404907bec0dfb5602f451b368af726e +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bf3bf4e9efff4b6cbe0726cff0d9d2c9772f68c --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8a41a745700889ef85bd8ae93c1e99d1a86ad2896345fe6b89c3303384e35a3 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe49ac604bcf6e3b08fdefec4eef507368590306 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bea0821a90bcff841a5e7426e8916a637db8f660ae626730d4d2a41abc29df0 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58e4885b3bee18977cd4f11e07d33445c94f9227 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9419dc8f61f9cc1f64feb206a8e5a43e599dd63bd4fc95d2eea2543a793cc71a +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/latest b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_0.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..b346349ce12dd5a17d4b91ed2a5722bb52550950 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_1.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f3c6994456cb8d0592a5375d99503c8924b1c4 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_2.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..be044f6ceeed587d30e80c2f72d5aa19fdc9947b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_3.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc825249656a9b858782542bd3f4386250f1dfe0 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_4.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d30f52a44be563c152ae09db6ae934da6da0d3ed --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_5.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8715d27ab23ae545d58039cf949cc44ecc1da5e --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_6.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed791b6ef76eadf0b0c55a5733411771e2ae027 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa +size 15984 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_7.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..800c3bbbc5edf7db01a8316069d439c5fb8d8c30 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/scheduler.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e201402bb36891e48e2b7110304ad87df61a6070 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b40f5e8ba2f299f4eda41d6964ef1f313f53d1f8f687ebd6938ce3242fb4c3 +size 1064 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/trainer_state.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..79dfaf17b1744152cee9177cd0ed560d10273be5 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.40576172, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20", + "epoch": 0.5263157894736842, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": 
true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.742470529381032, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.5703125, + "logps/chosen": -704.0, + "logps/rejected": -266.0, + "loss": 1.3349609375, + "memory(GiB)": 9.88, + "nll_loss": 0.64453125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.087917 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.977731082873543, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.611328125, + "logits/rejected": -1.634765625, + "logps/chosen": -719.5, + "logps/rejected": -400.5, + "loss": 2.087646484375, + "memory(GiB)": 19.62, + "nll_loss": 1.4091796875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.05169677734375, + "rewards/margins": 0.02392578125, + "rewards/rejected": 0.027721405029296875, + "step": 5, + "train_speed(iter/s)": 0.140484 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 7.808526234047937, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.529687523841858, + "logits/rejected": -1.6765625476837158, + "logps/chosen": -612.7999877929688, + "logps/rejected": -635.7999877929688, + "loss": 1.9408203125, + "memory(GiB)": 46.24, + "nll_loss": 1.470312476158142, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8820312023162842, + "rewards/margins": 0.841015636920929, + "rewards/rejected": 1.0390625, + "step": 10, + "train_speed(iter/s)": 0.141462 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.1525120590855167, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7843749523162842, + "logps/chosen": -625.2000122070312, + "logps/rejected": -439.3999938964844, + "loss": 1.054736328125, + "memory(GiB)": 46.24, + "nll_loss": 0.815625011920929, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 5.650000095367432, + "rewards/margins": 
3.207812547683716, + "rewards/rejected": 2.4453125, + "step": 15, + "train_speed(iter/s)": 0.151619 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.821418612981352, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.7531249523162842, + "logps/chosen": -399.3999938964844, + "logps/rejected": -522.0, + "loss": 0.8697021484375, + "memory(GiB)": 46.24, + "nll_loss": 0.8140624761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.362500190734863, + "rewards/margins": 4.324999809265137, + "rewards/rejected": 4.043749809265137, + "step": 20, + "train_speed(iter/s)": 0.152154 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.890625, + "eval_logits/rejected": -1.78125, + "eval_logps/chosen": -180.0, + "eval_logps/rejected": -1056.0, + "eval_loss": 0.40576171875, + "eval_nll_loss": 0.3828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.125, + "eval_rewards/margins": 5.5, + "eval_rewards/rejected": 5.625, + "eval_runtime": 2.3009, + "eval_samples_per_second": 1.738, + "eval_steps_per_second": 0.435, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 60674878472192.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/training_args.bin b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0f9c1513f988aabfc34ba541849784465bac6329 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e811855c17405f7bd5d7032b14eb256861d319a7a7facc3093caef660f4dc162 +size 9016 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/zero_to_fp32.py b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/README.md b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c5c0404813bb3d884c2d0750e24391042738c029 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/adapter_config.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..37f229c21d96b79be7d935d8753f6044f91e1d5b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": 
{}, + "revision": null, + "target_modules": [ + "down_proj", + "o_proj", + "q_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/adapter_model.safetensors b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d99d6a76f9f1b39f807b4a0a0c733dd59eab8d0 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08879f094a6ceb02a885ac94f910a85dc09684d943385d11df05b7123c2d7ee7 +size 68902296 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/additional_config.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/args.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..0756d2a52ad9348f0bd930f099dd745d3ae6f547 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", 
+ "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, 
+ "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + 
"fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..647953138f2c0bb35a47d7bbe0d3df9f72711558 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb829f855ccb90e1aba901d0c69d38e76cb5ff6c64b5a705c32d2d44f3df36f +size 51613616 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7f9db06bd67c61cb11374a8269260a2a7152277 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3698f1cf6fa6433690fb5a21c3d643de6cd23a966c521b5404bbcfb6fce7b3d4 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5731998fd4d61d20a7ca1ab466011dcf5336e9f9 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:543c99fec0ae87a5de4297ddfe809e09cd87f5ac3ba67aa56592994c38ce557f +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a995e379e901190e7aedcf819379ca4bfaee8e28 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1eaa3262fdc421843353c5ebfe7184adc536196a11098fe72c7061d43679c8af +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63e8bb4ddf96388a8b3a8ffe7cef053d8599599e --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa13dfe1c62c99767b079dc386e31791c90d26a499e6cd0c19665710b73400a4 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1651eca506690582d24245efb798da9aaa4267fa --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef409e133eadca060ae9aa2155767cc4038b37781b7653abc349bcc384c0c447 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..567b1699dd9a1c26fc690531877afa9252f8e9f1 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b153ad7be5335b83e9b9f251f5f34dbe8062089694c23aebc604d0044adc925f +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7462f8b98ce3ad9158b4b74c236648a49a25db0 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b68557a4ec29f746ebf5572eeaf615d6107fe0e3ff8450ea59820c8eb7edce0 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf1a17cc2c34f9e5964f4245f17ce5ef549f4b0a --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7e40f1d0e693da985e820d40836ce840dd0a25028c84556ce56686c43ff140f1 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a66fee2b5c2fe3149c446ed6eb6d2728370c0903 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd3a96ab841ea8edc5a5733291239bc9a73a5883202f78e6eed03d160622ec74 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ce9820eaa0a2c5097a82e2ce328514f700772d8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ba82966f5e185c8fa4a4c3a40d7677291073ed0acb333c72db547022e2e98a5 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac1e7000e0026883b3b950ebbb9fa8e4a6637756 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5322378f0fd2b0e7e5da7d12f90e8ea7aee03cbf1ac0d791bc1ba833c16d7a1 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d127ffc627ca36bb21c6b1eca8a2bf521aa50da --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f6131ee31fb6d82910bcad7f577f3fe59d1ff399837672e09db0e7235be4525 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c64e5a96bba7f588fe7a3099f90dc7d0510f05bc --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f618ace0e87a7ba721fe0d85f2c6a761784cbe5f4af680c8b2b820308ebf1b1e +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c752cd10b725602995cbd22b0f3377aaee9638b6 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a94fc621e5d26a398ddc4f86d846353ae3280bbaad91830e8ab5e6fc4eb06c9 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b700ad57e0cb16905fc47f8f90915c9cb0f0a60e --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea74a3ceaa1e5bc415f8c86488fac08521bec0fcba669aaeb27599191ed83cdf +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/latest b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_0.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..4e5b7e2ec90fdb824c8932464c1d9068330655a7 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36d2a2034ebb05cb71c510897f2795b31164e50f17b270bc25d2be3ad9a17b22 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_1.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d8d7722fc72cab6d492b76cb99c8177dcc47544 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060dfdb1c49102cbdc8868a6031e68787601b4ccd782f3fb9b137e20c1fd2c7a +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_2.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c9f84eff30cfa9ea1feedaf262d61fb12e4cba7 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af01895cb66e616591f2e4baa8dcd8151530eab133c73571ccb31c74f35422ce +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_3.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6eebfb928f8e91eff0ea1645a20b5aa4465c705b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677921992b1e0cef3aee776f245975003d22f51d9bd6ed20f248ded1deb72fa9 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_4.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..0866030a266c6d003cc378a9418a723f69e8ab99 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d69353c629541c690c5471f8ec05fdab2bfecf3d37afaa436bc45939da6db68f +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_5.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..554638d77107f832d7aa51c61645ee2d6c48a36d --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e40ba6668cc03c9162c68a933d164bf38ae2d196a9a6fec03ae615491201185 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_6.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..964331b65172a1bcac03e4673415fa787f724268 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:870968fea834e24b2e099cf3e4fe1e3fb8caf38d8f8e5b790d7d47386d4d05f5 +size 15984 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_7.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd4754d65217d0f9d1f2d3334397df7a8a079652 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9e19618bee7c6ef43256fea25abe19bca88535eb1e7dc213cde8929ae4e8180 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/scheduler.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d2abd2d1feb7e9804d318f0409ab46d47248ca5 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc0cfcde03016592eed8191f897341f523bbb99d728821c8afed66eae5a64729 +size 1064 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/trainer_state.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b4df7bf921577e35ccea51ae47040d0aa8db5ab9 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.33032227, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40", + "epoch": 1.0526315789473684, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": 
true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.742470529381032, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.5703125, + "logps/chosen": -704.0, + "logps/rejected": -266.0, + "loss": 1.3349609375, + "memory(GiB)": 9.88, + "nll_loss": 0.64453125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.087917 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.977731082873543, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.611328125, + "logits/rejected": -1.634765625, + "logps/chosen": -719.5, + "logps/rejected": -400.5, + "loss": 2.087646484375, + "memory(GiB)": 19.62, + "nll_loss": 1.4091796875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.05169677734375, + "rewards/margins": 0.02392578125, + "rewards/rejected": 0.027721405029296875, + "step": 5, + "train_speed(iter/s)": 0.140484 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 7.808526234047937, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.529687523841858, + "logits/rejected": -1.6765625476837158, + "logps/chosen": -612.7999877929688, + "logps/rejected": -635.7999877929688, + "loss": 1.9408203125, + "memory(GiB)": 46.24, + "nll_loss": 1.470312476158142, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8820312023162842, + "rewards/margins": 0.841015636920929, + "rewards/rejected": 1.0390625, + "step": 10, + "train_speed(iter/s)": 0.141462 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.1525120590855167, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7843749523162842, + "logps/chosen": -625.2000122070312, + "logps/rejected": -439.3999938964844, + "loss": 1.054736328125, + "memory(GiB)": 46.24, + "nll_loss": 0.815625011920929, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 5.650000095367432, + "rewards/margins": 
3.207812547683716, + "rewards/rejected": 2.4453125, + "step": 15, + "train_speed(iter/s)": 0.151619 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.821418612981352, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.7531249523162842, + "logps/chosen": -399.3999938964844, + "logps/rejected": -522.0, + "loss": 0.8697021484375, + "memory(GiB)": 46.24, + "nll_loss": 0.8140624761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.362500190734863, + "rewards/margins": 4.324999809265137, + "rewards/rejected": 4.043749809265137, + "step": 20, + "train_speed(iter/s)": 0.152154 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.890625, + "eval_logits/rejected": -1.78125, + "eval_logps/chosen": -180.0, + "eval_logps/rejected": -1056.0, + "eval_loss": 0.40576171875, + "eval_nll_loss": 0.3828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.125, + "eval_rewards/margins": 5.5, + "eval_rewards/rejected": 5.625, + "eval_runtime": 2.3009, + "eval_samples_per_second": 1.738, + "eval_steps_per_second": 0.435, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 2.7496208811504723, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -1.5984375476837158, + "logits/rejected": -1.626562476158142, + "logps/chosen": -567.5999755859375, + "logps/rejected": -600.2000122070312, + "loss": 0.64052734375, + "memory(GiB)": 46.24, + "nll_loss": 0.610156238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.112500190734863, + "rewards/margins": 6.525000095367432, + "rewards/rejected": 3.5999999046325684, + "step": 25, + "train_speed(iter/s)": 0.148916 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8408714330433036, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -1.423437476158142, + "logits/rejected": -1.8078124523162842, + "logps/chosen": -411.5, + "logps/rejected": -513.4000244140625, + "loss": 0.5287109375, + "memory(GiB)": 46.24, + 
"nll_loss": 0.528124988079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.162500381469727, + "rewards/margins": 9.125, + "rewards/rejected": 2.0218749046325684, + "step": 30, + "train_speed(iter/s)": 0.148793 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.7617614950729771, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -1.610937476158142, + "logits/rejected": -1.6828124523162842, + "logps/chosen": -632.4000244140625, + "logps/rejected": -559.2000122070312, + "loss": 0.579290771484375, + "memory(GiB)": 46.24, + "nll_loss": 0.578906238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.475000381469727, + "rewards/margins": 10.274999618530273, + "rewards/rejected": 2.206249952316284, + "step": 35, + "train_speed(iter/s)": 0.149226 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.7627215464350648, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -1.5515625476837158, + "logits/rejected": -1.6218750476837158, + "logps/chosen": -474.0, + "logps/rejected": -580.4000244140625, + "loss": 0.5338623046875, + "memory(GiB)": 46.24, + "nll_loss": 0.590624988079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.649999618530273, + "rewards/margins": 11.875, + "rewards/rejected": 1.77734375, + "step": 40, + "train_speed(iter/s)": 0.150086 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -0.7734375, + "eval_logits/rejected": -1.734375, + "eval_logps/chosen": -160.0, + "eval_logps/rejected": -1104.0, + "eval_loss": 0.330322265625, + "eval_nll_loss": 0.330078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.0625, + "eval_rewards/margins": 12.125, + "eval_rewards/rejected": 0.8984375, + "eval_runtime": 2.32, + "eval_samples_per_second": 1.724, + "eval_steps_per_second": 0.431, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + 
"should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 126357749366784.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/training_args.bin b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0f9c1513f988aabfc34ba541849784465bac6329 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e811855c17405f7bd5d7032b14eb256861d319a7a7facc3093caef660f4dc162 +size 9016 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/zero_to_fp32.py b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/README.md b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c5c0404813bb3d884c2d0750e24391042738c029 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-14b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/adapter_config.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..37f229c21d96b79be7d935d8753f6044f91e1d5b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": 
{}, + "revision": null, + "target_modules": [ + "down_proj", + "o_proj", + "q_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/adapter_model.safetensors b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2da6397f7ce9caa11fd60979e9176bdbc602774b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b14ae0c4b6845a2176d6865ce0da37fbcd2c6c345d9491bba2042b21c77b099 +size 68902296 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/additional_config.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/args.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..0756d2a52ad9348f0bd930f099dd745d3ae6f547 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", 
+ "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, 
+ "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + 
"fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d0290619dc113e514917cf0dc281866819d5de0 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c21bf09221520cdd48eae5d075b5b8513e2815cd2a171059c6a3a783175d714e +size 51613616 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25667a98f76cb6c7e34dac7e181624a671ad3a95 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a7e2c32fd09bc1aa5b5211d559e43588116a1e0d52bd202304e5a91e24fc601 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e116d3f06119255fea13b6e3e08d858ebd5a464 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f2e2b5653fac74a3a34d665e4d8b2e9a0cb94849decc83f9933d508e92fc50a +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de17879742ec74e2f6fd922a61c5960f682ef7e7 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79cda937a79eae5352e52864975a72e5cf30c58259b7e6aa67c32af9a82158ac +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a4eb7d67e885f6576ba55728762c9e67668ad0e --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1255986dfb710aff0461caf449ca3623892ab86cf715727ced39447cef2bf96 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62733d737f5af9d5362a3c070b72ea38ad170742 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ee6ad8ecf5feb8ad1d3e81d382176b3169cf0734148cb960bd64ee8adc32ac7 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bd3a943b0fd5fa646dfcf70b906bce8b59efabf --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f3d4efe709dbba94906f8e28b87d3b4d9fe43f63f95ca9c486d980a220d918a +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a5833b088e93fbb99e37fe1803c7ee285004ada --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee9f62296ad54ffe13dcc7713d2bc9dbec3cb95c4541d56f55189c9b25d4960 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8b27915d584017120bfda3775216a152bde3f56 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:66220f862e2c3dd67da8e55b217c6f859bdcd750bc91bcae8cd4187bec83d5f7 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39678a7b42d257243311db797974bc6d840ca61d --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e90c59fc47687a06ddabab80645f6459c0fa356d1baa3e018e54e35973306a2b +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3746c4f900b9cf2a3e1fb46abdf222cc0d55768 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e33b74425077e71ea8f0eeb2a46eb6386334f1fe2e5823571b5fae4404025745 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbbaa06191b38815bf1dba5997763877e2a55ddc --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a34f7727948a4bc7af444b024f0fbcd83061ad03b8f2508a8fdb12de764ce18c +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16d70953f9f251efbc7a2d4a296e9d83a84f46aa --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6102669d8598e888d410d3659df314192472daffdef2300185aa66f476d01d5f +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..205ba59e4644121b0767b59fa0caf54c1d33c2ad --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d606c2d6d6a798d8e8afa5ff20b2b90f48096da1876a6651c174a67c593496c1 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca4bc2793355aa5d8c3441ab83207af424b67ed2 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3eab23e5296eb9b2e69de82061d3920166f5f01699f4135ce3881343a45ab2f +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ad436d84d14bae09d18f76a653263f398551a7a --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef80c795f708c4fb202e6f85a83e39b1634f8026f45bb7ebe26d28019e726a56 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/latest b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..6dac34b840ecfb636ba8ab1e4da79fa1bdc8c3d4 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/latest @@ -0,0 +1 @@ +global_step60 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_0.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..9d31438b0bfd38acb69501aeb325fee7751b84e8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0ef6f96a48e59aa52c4b471312c2a62378c19acc7ebbae839612b03a7d775a +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_1.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6759906b0863c54055155658e8d374770ecfc5f9 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab11d533c0fdad46ea8b8e295ba5fdb705e078eeb88cc28f37d82913508766e9 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_2.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..784c719c042a2cca1f38818c7e9638aab398c859 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615c168147e3465ce5bfab6da2ff4afc68566ce00ec0f0c6c9fc988038a58d0a +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_3.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32e705bcb6afbb2ab95f5c68c07d0ccc3d457df --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f71e8f8674ecaef9f8cdcbf7ac457a8b8ff15b12694ba2a2fffcb4b43f0f08 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_4.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b97b2d3011e43a6dbac487263b52a0b3a55c83 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cf6d674dab5545c300a55135f08ca935730a3d35e2c419fb0b333f19482c19 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_5.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1acb3d3b1d3de061b997d1dee57e44b465d0630e --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2754f2cd8824702f027870d93748b3c0491b0ecd30f1e3d8e937116b2be6151f +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_6.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7760bbbcd6d3754ac81a5218adb6e0cd8036905b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1385124ac55604598f45ea6e2d141f29456647d3e7c10d12ca64ec93d312be8d +size 15984 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_7.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c4407057d0cb21c08140413cb320528190a941 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416538efaec7391fa8fe782fb15146b83e5612d9e1961292c34c53e964806873 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/scheduler.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d24bb2a6ed10249209e94b434ed554cac856d563 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c3a6465b9cb557a3a4db2097cdb877b1c624f5f645895d0cd27357a78258aa4 +size 1064 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/trainer_state.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..60a4d9d7b0f3f6b33068e800921619a486cd08e4 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.32299805, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60", + "epoch": 1.5789473684210527, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": 
true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.742470529381032, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.5703125, + "logps/chosen": -704.0, + "logps/rejected": -266.0, + "loss": 1.3349609375, + "memory(GiB)": 9.88, + "nll_loss": 0.64453125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.087917 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.977731082873543, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.611328125, + "logits/rejected": -1.634765625, + "logps/chosen": -719.5, + "logps/rejected": -400.5, + "loss": 2.087646484375, + "memory(GiB)": 19.62, + "nll_loss": 1.4091796875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.05169677734375, + "rewards/margins": 0.02392578125, + "rewards/rejected": 0.027721405029296875, + "step": 5, + "train_speed(iter/s)": 0.140484 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 7.808526234047937, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.529687523841858, + "logits/rejected": -1.6765625476837158, + "logps/chosen": -612.7999877929688, + "logps/rejected": -635.7999877929688, + "loss": 1.9408203125, + "memory(GiB)": 46.24, + "nll_loss": 1.470312476158142, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8820312023162842, + "rewards/margins": 0.841015636920929, + "rewards/rejected": 1.0390625, + "step": 10, + "train_speed(iter/s)": 0.141462 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.1525120590855167, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7843749523162842, + "logps/chosen": -625.2000122070312, + "logps/rejected": -439.3999938964844, + "loss": 1.054736328125, + "memory(GiB)": 46.24, + "nll_loss": 0.815625011920929, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 5.650000095367432, + "rewards/margins": 
3.207812547683716, + "rewards/rejected": 2.4453125, + "step": 15, + "train_speed(iter/s)": 0.151619 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.821418612981352, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.7531249523162842, + "logps/chosen": -399.3999938964844, + "logps/rejected": -522.0, + "loss": 0.8697021484375, + "memory(GiB)": 46.24, + "nll_loss": 0.8140624761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.362500190734863, + "rewards/margins": 4.324999809265137, + "rewards/rejected": 4.043749809265137, + "step": 20, + "train_speed(iter/s)": 0.152154 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.890625, + "eval_logits/rejected": -1.78125, + "eval_logps/chosen": -180.0, + "eval_logps/rejected": -1056.0, + "eval_loss": 0.40576171875, + "eval_nll_loss": 0.3828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.125, + "eval_rewards/margins": 5.5, + "eval_rewards/rejected": 5.625, + "eval_runtime": 2.3009, + "eval_samples_per_second": 1.738, + "eval_steps_per_second": 0.435, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 2.7496208811504723, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -1.5984375476837158, + "logits/rejected": -1.626562476158142, + "logps/chosen": -567.5999755859375, + "logps/rejected": -600.2000122070312, + "loss": 0.64052734375, + "memory(GiB)": 46.24, + "nll_loss": 0.610156238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.112500190734863, + "rewards/margins": 6.525000095367432, + "rewards/rejected": 3.5999999046325684, + "step": 25, + "train_speed(iter/s)": 0.148916 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8408714330433036, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -1.423437476158142, + "logits/rejected": -1.8078124523162842, + "logps/chosen": -411.5, + "logps/rejected": -513.4000244140625, + "loss": 0.5287109375, + "memory(GiB)": 46.24, + 
"nll_loss": 0.528124988079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.162500381469727, + "rewards/margins": 9.125, + "rewards/rejected": 2.0218749046325684, + "step": 30, + "train_speed(iter/s)": 0.148793 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.7617614950729771, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -1.610937476158142, + "logits/rejected": -1.6828124523162842, + "logps/chosen": -632.4000244140625, + "logps/rejected": -559.2000122070312, + "loss": 0.579290771484375, + "memory(GiB)": 46.24, + "nll_loss": 0.578906238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.475000381469727, + "rewards/margins": 10.274999618530273, + "rewards/rejected": 2.206249952316284, + "step": 35, + "train_speed(iter/s)": 0.149226 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.7627215464350648, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -1.5515625476837158, + "logits/rejected": -1.6218750476837158, + "logps/chosen": -474.0, + "logps/rejected": -580.4000244140625, + "loss": 0.5338623046875, + "memory(GiB)": 46.24, + "nll_loss": 0.590624988079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.649999618530273, + "rewards/margins": 11.875, + "rewards/rejected": 1.77734375, + "step": 40, + "train_speed(iter/s)": 0.150086 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -0.7734375, + "eval_logits/rejected": -1.734375, + "eval_logps/chosen": -160.0, + "eval_logps/rejected": -1104.0, + "eval_loss": 0.330322265625, + "eval_nll_loss": 0.330078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.0625, + "eval_rewards/margins": 12.125, + "eval_rewards/rejected": 0.8984375, + "eval_runtime": 2.32, + "eval_samples_per_second": 1.724, + "eval_steps_per_second": 0.431, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.4870463862811011, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -1.4578125476837158, + "logits/rejected": -1.5656249523162842, + 
"logps/chosen": -445.0, + "logps/rejected": -619.5999755859375, + "loss": 0.44976806640625, + "memory(GiB)": 47.69, + "nll_loss": 0.44999998807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 14.837499618530273, + "rewards/rejected": 0.16914062201976776, + "step": 45, + "train_speed(iter/s)": 0.148124 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.5046561338439631, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -1.4015624523162842, + "logits/rejected": -1.6484375, + "logps/chosen": -494.3999938964844, + "logps/rejected": -602.7999877929688, + "loss": 0.497467041015625, + "memory(GiB)": 47.69, + "nll_loss": 0.49726563692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.850000381469727, + "rewards/margins": 15.300000190734863, + "rewards/rejected": 0.533007800579071, + "step": 50, + "train_speed(iter/s)": 0.149026 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.24557805720844658, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -1.3078124523162842, + "logits/rejected": -1.5078125, + "logps/chosen": -472.79998779296875, + "logps/rejected": -544.0, + "loss": 0.478399658203125, + "memory(GiB)": 47.69, + "nll_loss": 0.4781250059604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.412500381469727, + "rewards/margins": 15.337499618530273, + "rewards/rejected": 1.0632812976837158, + "step": 55, + "train_speed(iter/s)": 0.151404 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.17812446885778863, + "learning_rate": 5e-05, + "logits/chosen": -1.446874976158142, + "logits/rejected": -1.345312476158142, + "logps/chosen": -517.2000122070312, + "logps/rejected": -388.79998779296875, + "loss": 0.437841796875, + "memory(GiB)": 47.69, + "nll_loss": 0.4378906190395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.787500381469727, + "rewards/margins": 15.712499618530273, + "rewards/rejected": 1.034765601158142, + "step": 60, + "train_speed(iter/s)": 0.152443 + }, + { + 
"epoch": 1.5789473684210527, + "eval_logits/chosen": -0.55078125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -152.0, + "eval_logps/rejected": -1088.0, + "eval_loss": 0.322998046875, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.875, + "eval_rewards/margins": 12.0625, + "eval_rewards/rejected": 1.8515625, + "eval_runtime": 2.3313, + "eval_samples_per_second": 1.716, + "eval_steps_per_second": 0.429, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 189633194622976.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/training_args.bin b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0f9c1513f988aabfc34ba541849784465bac6329 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e811855c17405f7bd5d7032b14eb256861d319a7a7facc3093caef660f4dc162 +size 9016 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/zero_to_fp32.py b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ 
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model-states file stored in ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory that holds the DeepSpeed checkpoint files
        - ``zero_stage``: ZeRO stage of the checkpoint; stages ``<= 2`` use
          ``mp_rank_00_model_states.pt``, stage ``3`` uses ``zero_pp_rank_0_mp_rank_00_model_states.pt``

    Returns:
        - the full path of the model-states file

    Raises:
        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
        - ``ValueError``: ``zero_stage`` is neither ``<= 2`` nor ``3``
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch `file` would be unbound and the next line would fail with UnboundLocalError
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load every ``*_optim_states.pt`` shard and pull out what is needed to rebuild the fp32 weights.

    Args:
        - ``files``: paths of the optimizer-state shards, one per rank
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per shard

    Raises:
        - ``ValueError``: the first shard is not a ZeRO checkpoint, the number of shards does not match
          the recorded partition count, or the ZeRO stage is unknown
    """
    total_files = len(files)
    shards = []
    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
        shards.append(shard)

    first_optim_state = shards[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in shards]
    return zero_stage, world_size, fp32_flat_groups
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters of a ZeRO-1/2 checkpoint and store them in ``state_dict``.

    For every param group the per-rank partitions are concatenated into one flat fp32 vector, then
    each parameter is carved out of that vector (as a view) using the recorded shapes.

    Args:
        - ``state_dict``: mapping updated in place with ``name -> tensor``
        - ``world_size``: number of partitions; also determines the alignment used by the sanity check
        - ``fp32_flat_groups``: indexed as ``[rank][param_group]``, the flat fp32 partitions
        - ``zero_model_states``: parsed model states; only ``param_shapes`` of the first entry is read

    Raises:
        - ``ValueError``: the aligned number of consumed elements differs from the aligned size of a group
    """
    param_shapes = zero_model_states[0].param_shapes

    def shape_numel(shape):
        # a shape is either an object with a callable ``numel`` or a plain sequence of ints
        return shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        # use the same numel rule as the main loop so plain-sequence shapes don't break debug mode
        wanted_numel = sum([sum(shape_numel(shape) for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = shape_numel(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Describe how a parameter of ``unpartitioned_numel`` elements is split across ``world_size`` ranks.

    Returns:
        - ``(partitioned_numel, padding_numel)``: the per-rank element count (rounded up) and the
          number of elements needed to make the total a multiple of ``world_size``
    """
    leftover = unpartitioned_numel % world_size
    if leftover:
        padding_numel = world_size - leftover
    else:
        padding_numel = 0
    return math.ceil(unpartitioned_numel / world_size), padding_numel
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.

    Args:
        - ``flat_groups``: flat tensors indexed as ``flat_groups[rank][group]``
        - ``flat_groups_offset``: cumulative element offsets of the groups; entry ``g`` and ``g + 1``
          bound group ``g`` (so it is expected to hold one more entry than there are groups)
        - ``offset``: start of this parameter's slice inside each rank's groups
        - ``partitioned_numel``: number of elements of the slice taken from each rank
        - ``shape``: final shape of the parameter; must provide ``numel()``
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Returns:
            - a contiguous tensor of shape ``self.shape`` built from the per-rank slices; trailing
              padding elements are dropped

        Raises:
            - ``ValueError``: the slice ``[offset, offset + partitioned_numel)`` is not covered by
              ``flat_groups_offset``
        """
        start_idx = self.offset
        end_idx = start_idx + self.partitioned_numel

        if self.partitioned_numel == 0:
            # nothing to gather (zero-element parameter); torch.cat cannot take an empty list
            pad_flat_param = torch.empty(0, dtype=self.dtype)
            return pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()

        # Locate the group(s) that hold the slice. The answer is the same for every rank, so it is
        # computed once instead of once per rank.
        bounds = self.flat_groups_offset
        start_group_id = None
        end_group_id = None
        for group_id in range(len(bounds) - 1):
            if bounds[group_id] <= start_idx < bounds[group_id + 1]:
                start_group_id = group_id
            if bounds[group_id] < end_idx <= bounds[group_id + 1]:
                end_group_id = group_id
                break
        if start_group_id is None or end_group_id is None:
            raise ValueError(f"slice [{start_idx}, {end_idx}) is outside of the flat groups (offsets: {list(bounds)})")

        pad_flat_param_chunks = []
        for flat_groups_at_rank_i in self.flat_groups:
            # collect weights from related group/groups
            for group_id in range(start_group_id, end_group_id + 1):
                group_start = bounds[group_id]
                flat_tensor = flat_groups_at_rank_i[group_id]
                # clamp to the group start: for the 2nd+ group of a multi-group slice the raw
                # difference is negative and would be read as an index from the end of the tensor
                start_offset = max(start_idx, group_start) - group_start
                end_offset = min(end_idx, bounds[group_id + 1]) - group_start
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                             tag=None,
                                             exclude_frozen_parameters=False,
                                             lazy_mode=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
          Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` on it

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` is not given and ``checkpoint_dir`` has no ``latest`` file
        - ``FileNotFoundError``: the ``<checkpoint_dir>/<tag>`` folder doesn't exist

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint. Or you can load state_dict in lazy mode ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
        for name, lazy_tensor in state_dict.items():
            tensor = lazy_tensor.contiguous()  # to cpu
            print(name, tensor)
            # del tensor to release memory if it is no longer in use
    """
    if tag is None:
        # fall back to the tag recorded in the 'latest' file of the checkpoint folder
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
    if lazy_mode:
        return state_dict
    return to_torch_tensor(state_dict)
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the offline conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    # NOTE: adjacent string literals are joined as-is, so every fragment below ends with an explicit space
    parser.add_argument("output_dir",
                        type=str,
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    parser.add_argument(
        "--safe_serialization",
        default=False,
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # module-level flag read by the merge helpers to enable verbose prints
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/adapter_config.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..37f229c21d96b79be7d935d8753f6044f91e1d5b --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": 
{}, + "revision": null, + "target_modules": [ + "down_proj", + "o_proj", + "q_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/adapter_model.safetensors b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..63c1c39a00f3c7225b1750e49dc6489c32662b15 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5c8642257b11c16ebc79ce24b31406e1325b9dc1fa31c58d6611a28dc88234 +size 68902296 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/additional_config.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/args.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/args.json new file mode 100644 index 0000000000000000000000000000000000000000..0756d2a52ad9348f0bd930f099dd745d3ae6f547 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "model_type": "deepseek_r1_distill", 
+ "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, 
+ "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + 
"fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-14b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-14b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3adea28c2384fc0cbc404decea6a9018694f4b1 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:748b61f271ff50d58ed69a14016f9c34cc88906d4011ac2be397f9ddefc00f14 +size 51613616 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1563719cc5fd1daca4df651c840d8129e58a07b2 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07a8467ef84899b8bf3c0d4b69061eea5c2539c8f0e0a73fd95211c85797ea73 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a5eb47173d309af99e5ef480fa0a6079eed8cf8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809d30dfaf0d88b6ef1deac24ce1fd101bbd5db3c4330f8861fd5f02ffc6c2d +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b19cfc4542de94e4077bceeedc36b1df2331e2c4 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afab59c3b3b2f54e7df37526b200d7f9124f6a4c52c7c37cb3287eddaa18f1f0 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c02cc112cdab48cf5b79f51592c97d3e9b1f4a5a --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce0e87c0d62baeaacb60d51390b7c451264e3a6c13b053747c3e671fdef72a53 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96a92125a786dc51232877bd06397addc5f0bbb8 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:228d6e2912dad6c22906411e24cdda44e33ada65b75ca2cc84014766a5c7fbb3 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fea17e6170c9ef42aaa722285e7e3a570748ddfd --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e97ed9495a8b1fad69d851a253f5d53ef602b70e178bb29d4415053bebdb3bb +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccc9e740cc4269285e52354607bb294b59761eea --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee5957075b5150725c81fe09b2dff64ed297cb7ba3f8874c5a1d59b507a040e0 +size 51613616 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05e86de4a177fb97f0c143afaf9dcc4098b99be6 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7a132ec25e6c1d34867a625464c63159b473e9e36f82fb5ad09c21b0f0e2f0ac +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f9133a15655d28ff1e1f4efe91c31e7e23f8652 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62d5c0caa1f5dc62ea83df7fabafe5a04998d7fc8bda5c0c096bc17d662d172e +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5daf083fe068e189b1f8baafddbec11464dc22e --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3560ed019a7041d0b4499958fd90fdf8b65fb1b9ffd6586fd02b6dedf0fd3e6 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a512bf6fde687517d099706ebfc3772cf0dd0b7 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbd85a56f951ff2f07deefe86539ea952c880679b1c4e2f9ee7c3ed56420987d +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c40d0dd137dc6a8b7a558a55ecba8ee4ead596d --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e677785727f2607581fef5a18b4352b48abdf0cfd722ec2ce42ad93eee2a43 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82f1cd849c85a6b1686d68e80c752956384ff336 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1543250bde591b5d07d479b057fc29110efdeaec2a01eb5197442e26369a83 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..060173f536423c04902395d83daad4cabb4c0d1c --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:becacd733d0ad2583d7f13e1391b1f851461e96d21383c9b925451006db3e502 +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe8dfdf82d58293ce827ea0289aa96663a6b6d89 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1d35f02bb2d4604e9f127721a4072884330f4ef742c0a8fb81a39c810f2874b +size 664974 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/latest b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..75eab498d0366633484ab40334e4b8fb92b16dad --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/latest @@ -0,0 +1 @@ +global_step80 \ No newline at end of file diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_0.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..572d9bd86f4559e91e7b9a4fdc47494e5c6e9568 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d7e02ffb4d440dce7ab4ce0b5617578ec9ce3672acee7434ed6f1153f1ae0c +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_1.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d21df4c1d8717a3994f151fbc05460a0172725e --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b40ca759e432b2688f021b81291d74a40f56a205e9842119f7e772275eebd3 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_2.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6048bfa1e35e3b563aec9f5c1c6788496c3f068d --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdaef955ddd36d6bc1c40584113dd6205483e2aa85b02439b8b27e82e02a8359 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_3.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3722ed81a034ae380c794d8b45b0464c00099aa6 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_3.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b14ae5db356e6512538751d6b386c190754e307cc99cd652d5c6dd891e1f82 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_4.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..13231ff967baa9c056d5a7ec0cc489a62679039c --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26e28be26826eeeed244b77185c67b443ac185175f8d4bf5ba94caa8b271bc5 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_5.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3401deecf687fd1382dae699b8d2e1a52949a4a --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847cedc1d6ca26f299a132c2ade9754887374acb9d98f26594a85d4c7742d474 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_6.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ab10b8ff32ba08d69bdf75cb904d226b3d9008 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd043d1690ae0ff6991b03322799a0b28f021427b15fd9f1e5ed8b9905d9307 +size 15984 diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_7.pth b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c6fb5670c4f108f08c81f04f22272cdd57b7745 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772190f7e6667c865d25fc72da7bdd1b5d39f46fe03bb5c2d754aee1ad3c99c7 +size 15984 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/scheduler.pt b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..eaf96d6803aea265d756d902db3c4cc2386f9742 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90524bcdb94734ac7120e4205110f14662bff8cee00eed50355875dcdc538029 +size 1064 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/trainer_state.json b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..49402bcc214d2f7ffb9e1afda342fb15f330dcff --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.30786133, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80", + "epoch": 2.1052631578947367, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": 
true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.742470529381032, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -2.15625, + "logits/rejected": -1.5703125, + "logps/chosen": -704.0, + "logps/rejected": -266.0, + "loss": 1.3349609375, + "memory(GiB)": 9.88, + "nll_loss": 0.64453125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.087917 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 13.977731082873543, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.611328125, + "logits/rejected": -1.634765625, + "logps/chosen": -719.5, + "logps/rejected": -400.5, + "loss": 2.087646484375, + "memory(GiB)": 19.62, + "nll_loss": 1.4091796875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.05169677734375, + "rewards/margins": 0.02392578125, + "rewards/rejected": 0.027721405029296875, + "step": 5, + "train_speed(iter/s)": 0.140484 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 7.808526234047937, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.529687523841858, + "logits/rejected": -1.6765625476837158, + "logps/chosen": -612.7999877929688, + "logps/rejected": -635.7999877929688, + "loss": 1.9408203125, + "memory(GiB)": 46.24, + "nll_loss": 1.470312476158142, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8820312023162842, + "rewards/margins": 0.841015636920929, + "rewards/rejected": 1.0390625, + "step": 10, + "train_speed(iter/s)": 0.141462 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 3.1525120590855167, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.7109375, + "logits/rejected": -1.7843749523162842, + "logps/chosen": -625.2000122070312, + "logps/rejected": -439.3999938964844, + "loss": 1.054736328125, + "memory(GiB)": 46.24, + "nll_loss": 0.815625011920929, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 5.650000095367432, + "rewards/margins": 
3.207812547683716, + "rewards/rejected": 2.4453125, + "step": 15, + "train_speed(iter/s)": 0.151619 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.821418612981352, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.7531249523162842, + "logps/chosen": -399.3999938964844, + "logps/rejected": -522.0, + "loss": 0.8697021484375, + "memory(GiB)": 46.24, + "nll_loss": 0.8140624761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.362500190734863, + "rewards/margins": 4.324999809265137, + "rewards/rejected": 4.043749809265137, + "step": 20, + "train_speed(iter/s)": 0.152154 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -0.890625, + "eval_logits/rejected": -1.78125, + "eval_logps/chosen": -180.0, + "eval_logps/rejected": -1056.0, + "eval_loss": 0.40576171875, + "eval_nll_loss": 0.3828125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.125, + "eval_rewards/margins": 5.5, + "eval_rewards/rejected": 5.625, + "eval_runtime": 2.3009, + "eval_samples_per_second": 1.738, + "eval_steps_per_second": 0.435, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 2.7496208811504723, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -1.5984375476837158, + "logits/rejected": -1.626562476158142, + "logps/chosen": -567.5999755859375, + "logps/rejected": -600.2000122070312, + "loss": 0.64052734375, + "memory(GiB)": 46.24, + "nll_loss": 0.610156238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.112500190734863, + "rewards/margins": 6.525000095367432, + "rewards/rejected": 3.5999999046325684, + "step": 25, + "train_speed(iter/s)": 0.148916 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8408714330433036, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -1.423437476158142, + "logits/rejected": -1.8078124523162842, + "logps/chosen": -411.5, + "logps/rejected": -513.4000244140625, + "loss": 0.5287109375, + "memory(GiB)": 46.24, + 
"nll_loss": 0.528124988079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.162500381469727, + "rewards/margins": 9.125, + "rewards/rejected": 2.0218749046325684, + "step": 30, + "train_speed(iter/s)": 0.148793 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.7617614950729771, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -1.610937476158142, + "logits/rejected": -1.6828124523162842, + "logps/chosen": -632.4000244140625, + "logps/rejected": -559.2000122070312, + "loss": 0.579290771484375, + "memory(GiB)": 46.24, + "nll_loss": 0.578906238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.475000381469727, + "rewards/margins": 10.274999618530273, + "rewards/rejected": 2.206249952316284, + "step": 35, + "train_speed(iter/s)": 0.149226 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.7627215464350648, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -1.5515625476837158, + "logits/rejected": -1.6218750476837158, + "logps/chosen": -474.0, + "logps/rejected": -580.4000244140625, + "loss": 0.5338623046875, + "memory(GiB)": 46.24, + "nll_loss": 0.590624988079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.649999618530273, + "rewards/margins": 11.875, + "rewards/rejected": 1.77734375, + "step": 40, + "train_speed(iter/s)": 0.150086 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -0.7734375, + "eval_logits/rejected": -1.734375, + "eval_logps/chosen": -160.0, + "eval_logps/rejected": -1104.0, + "eval_loss": 0.330322265625, + "eval_nll_loss": 0.330078125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.0625, + "eval_rewards/margins": 12.125, + "eval_rewards/rejected": 0.8984375, + "eval_runtime": 2.32, + "eval_samples_per_second": 1.724, + "eval_steps_per_second": 0.431, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.4870463862811011, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -1.4578125476837158, + "logits/rejected": -1.5656249523162842, + 
"logps/chosen": -445.0, + "logps/rejected": -619.5999755859375, + "loss": 0.44976806640625, + "memory(GiB)": 47.69, + "nll_loss": 0.44999998807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.0, + "rewards/margins": 14.837499618530273, + "rewards/rejected": 0.16914062201976776, + "step": 45, + "train_speed(iter/s)": 0.148124 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.5046561338439631, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -1.4015624523162842, + "logits/rejected": -1.6484375, + "logps/chosen": -494.3999938964844, + "logps/rejected": -602.7999877929688, + "loss": 0.497467041015625, + "memory(GiB)": 47.69, + "nll_loss": 0.49726563692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.850000381469727, + "rewards/margins": 15.300000190734863, + "rewards/rejected": 0.533007800579071, + "step": 50, + "train_speed(iter/s)": 0.149026 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.24557805720844658, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -1.3078124523162842, + "logits/rejected": -1.5078125, + "logps/chosen": -472.79998779296875, + "logps/rejected": -544.0, + "loss": 0.478399658203125, + "memory(GiB)": 47.69, + "nll_loss": 0.4781250059604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.412500381469727, + "rewards/margins": 15.337499618530273, + "rewards/rejected": 1.0632812976837158, + "step": 55, + "train_speed(iter/s)": 0.151404 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.17812446885778863, + "learning_rate": 5e-05, + "logits/chosen": -1.446874976158142, + "logits/rejected": -1.345312476158142, + "logps/chosen": -517.2000122070312, + "logps/rejected": -388.79998779296875, + "loss": 0.437841796875, + "memory(GiB)": 47.69, + "nll_loss": 0.4378906190395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.787500381469727, + "rewards/margins": 15.712499618530273, + "rewards/rejected": 1.034765601158142, + "step": 60, + "train_speed(iter/s)": 0.152443 + }, + { + 
"epoch": 1.5789473684210527, + "eval_logits/chosen": -0.55078125, + "eval_logits/rejected": -1.609375, + "eval_logps/chosen": -152.0, + "eval_logps/rejected": -1088.0, + "eval_loss": 0.322998046875, + "eval_nll_loss": 0.3125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.875, + "eval_rewards/margins": 12.0625, + "eval_rewards/rejected": 1.8515625, + "eval_runtime": 2.3313, + "eval_samples_per_second": 1.716, + "eval_steps_per_second": 0.429, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.5905179750476125, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -1.3390624523162842, + "logits/rejected": -1.3562500476837158, + "logps/chosen": -453.0, + "logps/rejected": -488.79998779296875, + "loss": 0.42652587890625, + "memory(GiB)": 47.69, + "nll_loss": 0.42656248807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.825000762939453, + "rewards/margins": 16.549999237060547, + "rewards/rejected": 0.27421873807907104, + "step": 65, + "train_speed(iter/s)": 0.152062 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.5364455470833442, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": -1.192968726158142, + "logits/rejected": -1.48828125, + "logps/chosen": -405.0, + "logps/rejected": -501.3999938964844, + "loss": 0.455877685546875, + "memory(GiB)": 47.69, + "nll_loss": 0.45429688692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.674999237060547, + "rewards/margins": 16.6875, + "rewards/rejected": -0.03144531324505806, + "step": 70, + "train_speed(iter/s)": 0.152032 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.5248526481142106, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": -1.2296874523162842, + "logits/rejected": -1.334375023841858, + "logps/chosen": -513.2000122070312, + "logps/rejected": -614.4000244140625, + "loss": 0.4895751953125, + "memory(GiB)": 47.69, + "nll_loss": 0.4898437559604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.975000381469727, + 
"rewards/margins": 17.950000762939453, + "rewards/rejected": -0.9507812261581421, + "step": 75, + "train_speed(iter/s)": 0.152668 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.5755623127495862, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": -1.2664062976837158, + "logits/rejected": -1.2742187976837158, + "logps/chosen": -435.6000061035156, + "logps/rejected": -602.0, + "loss": 0.46800537109375, + "memory(GiB)": 47.69, + "nll_loss": 0.47265625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.549999237060547, + "rewards/margins": 19.612499237060547, + "rewards/rejected": -3.076171875, + "step": 80, + "train_speed(iter/s)": 0.151515 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -0.423828125, + "eval_logits/rejected": -1.5703125, + "eval_logps/chosen": -150.0, + "eval_logps/rejected": -1120.0, + "eval_loss": 0.307861328125, + "eval_nll_loss": 0.30859375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.0625, + "eval_rewards/margins": 15.1875, + "eval_rewards/rejected": -1.140625, + "eval_runtime": 2.3353, + "eval_samples_per_second": 1.713, + "eval_steps_per_second": 0.428, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 253721088098304.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/training_args.bin b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0f9c1513f988aabfc34ba541849784465bac6329 --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e811855c17405f7bd5d7032b14eb256861d319a7a7facc3093caef660f4dc162 +size 9016 diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/zero_to_fp32.py b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logits_chosen.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..635428250c9d9c229f4cc91beaa51bffa8dfb019 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logits_chosen.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logits_rejected.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..f3a637abba1acd90080a6986d86b29e3ac772ef0 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logits_rejected.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logps_chosen.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..f413aae77d06a93d91901a2ad58af31a1874581f Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logps_chosen.png differ diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logps_rejected.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..7a6ec3f6aa10cd5a067090ef8f8e93ed4319c496 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_logps_rejected.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_loss.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..5920c99f90e3bce638eda9d43e2b1696965ddd93 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_loss.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_nll_loss.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..bb4bf9c1e4708f3bb33c44e171b905a6322f84af Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_nll_loss.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_accuracies.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..b359853ac747638a7e44efa30e8bcf81f3befcfc Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_accuracies.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_chosen.png 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..9e84c7e468f4740b9998b5dc71ec24074688340b Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_chosen.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_margins.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..da4557e6ab4266b9c03136d1ca31c96e9dc85454 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_margins.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_rejected.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..d594467963fb98b233047ce811808772f6c65e26 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_rewards_rejected.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_runtime.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..14768decfb1d8c655f9e638f8230c26fcc4f3a9f Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_runtime.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_samples_per_second.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_samples_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..16bd090d1d8ef81799d0255ad379f0c1f3ed7266 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_samples_per_second.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_steps_per_second.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..294dbe211d4243177a93e390557119cec7a618b6 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/eval_steps_per_second.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_epoch.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..c152c9bbb1100d680125e899d3d0da3db98dc221 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_epoch.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_grad_norm.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..cfbe752c37b9b7dfd3a124bf3c131187b8c25c68 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_grad_norm.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_learning_rate.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..e35e42c173f451f9666ba448872c615528441c9e Binary files /dev/null and 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_learning_rate.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logits_chosen.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..65f99912bec79b682c484fa658284c5659665ca7 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logits_chosen.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logits_rejected.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..01ac5e75ec33608db37da1c92fdc71d9ba844e3c Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logits_rejected.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logps_chosen.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..843ffe13f3531eca26072a563605719e092ec27b Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logps_chosen.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logps_rejected.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..a5260f0ab6e177ff6579dca6acb3e27c68bc6e42 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_logps_rejected.png differ diff --git 
a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_loss.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..d2cf383faa39fd4e3a0f8802aaff57ee959db6d8 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_loss.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_memory(GiB).png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..4ccce56e14006fe5fd0211e7e1ae9ec2b71db910 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_memory(GiB).png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_nll_loss.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..619fc21a6050d2b028b901a1f5d49d2236f96dde Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_nll_loss.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_accuracies.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..4f87a8d44447b1644ce277b180d87b00e540261e Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_accuracies.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_chosen.png 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..51def6cbcc1fcf323eb1dc8263773e467c3c785c Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_chosen.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_margins.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..08e5b579aa3e8939dfc4aaf781225384d3938586 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_margins.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_rejected.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..a7fa6938edc01ac7cdb58ac0dcec6ec156bc836d Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_rewards_rejected.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_total_flos.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..91f6835564dae58eae5bf77ed6c4b4c481c8f6d9 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_total_flos.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_loss.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_loss.png new file mode 100644 index 
0000000000000000000000000000000000000000..50cd4472964bd5c4f13650fc36d74a50324a537b Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_loss.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_runtime.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..733364f140fb32af0c0d7878452e4fa23a93837c Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_runtime.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_samples_per_second.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..a6d65944cdacd95f63a24e242224140bc952473f Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_samples_per_second.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_speed(iter_s).png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..2530b4304392406658c677b1988f3f5f336fb01e Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_speed(iter_s).png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_steps_per_second.png b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_steps_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..37807332e1867ba0ce61b27269b169976c5d1e97 Binary files /dev/null and b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/images/train_train_steps_per_second.png differ diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/logging.jsonl b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f805970753407a65e8434b260ddb4b256f577ba4 --- /dev/null +++ b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/logging.jsonl @@ -0,0 +1,31 @@ +{"loss": 1.33496094, "grad_norm": 9.74247053, "learning_rate": 1.667e-05, "memory(GiB)": 9.88, "train_speed(iter/s)": 0.087917, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/chosen": -704.0, "logps/rejected": -266.0, "logits/chosen": -2.15625, "logits/rejected": -1.5703125, "nll_loss": 0.64453125, "epoch": 0.02631579, "global_step/max_steps": "1/114", "percentage": "0.88%", "elapsed_time": "7s", "remaining_time": "14m 26s"} +{"loss": 2.08764648, "grad_norm": 13.97773108, "learning_rate": 8.333e-05, "memory(GiB)": 19.62, "train_speed(iter/s)": 0.140484, "rewards/chosen": 0.05169678, "rewards/rejected": 0.02772141, "rewards/accuracies": 0.1875, "rewards/margins": 0.02392578, "logps/chosen": -719.5, "logps/rejected": -400.5, "logits/chosen": -1.61132812, "logits/rejected": -1.63476562, "nll_loss": 1.40917969, "epoch": 0.13157895, "global_step/max_steps": "5/114", "percentage": "4.39%", "elapsed_time": "31s", "remaining_time": "11m 35s"} +{"loss": 1.94082031, "grad_norm": 7.80852623, "learning_rate": 9.966e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.141462, "rewards/chosen": 1.8820312, "rewards/rejected": 1.0390625, "rewards/accuracies": 0.75, "rewards/margins": 0.84101564, "logps/chosen": -612.79998779, "logps/rejected": -635.79998779, "logits/chosen": -1.52968752, 
"logits/rejected": -1.67656255, "nll_loss": 1.47031248, "epoch": 0.26315789, "global_step/max_steps": "10/114", "percentage": "8.77%", "elapsed_time": "1m 6s", "remaining_time": "11m 36s"} +{"loss": 1.05473633, "grad_norm": 3.15251206, "learning_rate": 9.83e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.151619, "rewards/chosen": 5.6500001, "rewards/rejected": 2.4453125, "rewards/accuracies": 0.89999998, "rewards/margins": 3.20781255, "logps/chosen": -625.20001221, "logps/rejected": -439.3999939, "logits/chosen": -1.7109375, "logits/rejected": -1.78437495, "nll_loss": 0.81562501, "epoch": 0.39473684, "global_step/max_steps": "15/114", "percentage": "13.16%", "elapsed_time": "1m 35s", "remaining_time": "10m 28s"} +{"loss": 0.86970215, "grad_norm": 1.82141861, "learning_rate": 9.591e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.152154, "rewards/chosen": 8.36250019, "rewards/rejected": 4.04374981, "rewards/accuracies": 1.0, "rewards/margins": 4.32499981, "logps/chosen": -399.3999939, "logps/rejected": -522.0, "logits/chosen": -1.64374995, "logits/rejected": -1.75312495, "nll_loss": 0.81406248, "epoch": 0.52631579, "global_step/max_steps": "20/114", "percentage": "17.54%", "elapsed_time": "2m 7s", "remaining_time": "10m 0s"} +{"eval_loss": 0.40576172, "eval_runtime": 2.3009, "eval_samples_per_second": 1.738, "eval_steps_per_second": 0.435, "eval_rewards/chosen": 11.125, "eval_rewards/rejected": 5.625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 5.5, "eval_logps/chosen": -180.0, "eval_logps/rejected": -1056.0, "eval_logits/chosen": -0.890625, "eval_logits/rejected": -1.78125, "eval_nll_loss": 0.3828125, "epoch": 0.52631579, "global_step/max_steps": "20/114", "percentage": "17.54%", "elapsed_time": "2m 10s", "remaining_time": "10m 11s"} +{"loss": 0.64052734, "grad_norm": 2.74962088, "learning_rate": 9.256e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.148916, "rewards/chosen": 10.11250019, "rewards/rejected": 3.5999999, "rewards/accuracies": 
1.0, "rewards/margins": 6.5250001, "logps/chosen": -567.59997559, "logps/rejected": -600.20001221, "logits/chosen": -1.59843755, "logits/rejected": -1.62656248, "nll_loss": 0.61015624, "epoch": 0.65789474, "global_step/max_steps": "25/114", "percentage": "21.93%", "elapsed_time": "2m 44s", "remaining_time": "9m 44s"} +{"loss": 0.52871094, "grad_norm": 0.84087143, "learning_rate": 8.83e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.148793, "rewards/chosen": 11.16250038, "rewards/rejected": 2.0218749, "rewards/accuracies": 1.0, "rewards/margins": 9.125, "logps/chosen": -411.5, "logps/rejected": -513.40002441, "logits/chosen": -1.42343748, "logits/rejected": -1.80781245, "nll_loss": 0.52812499, "epoch": 0.78947368, "global_step/max_steps": "30/114", "percentage": "26.32%", "elapsed_time": "3m 17s", "remaining_time": "9m 14s"} +{"loss": 0.57929077, "grad_norm": 0.7617615, "learning_rate": 8.324e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.149226, "rewards/chosen": 12.47500038, "rewards/rejected": 2.20624995, "rewards/accuracies": 1.0, "rewards/margins": 10.27499962, "logps/chosen": -632.40002441, "logps/rejected": -559.20001221, "logits/chosen": -1.61093748, "logits/rejected": -1.68281245, "nll_loss": 0.57890624, "epoch": 0.92105263, "global_step/max_steps": "35/114", "percentage": "30.70%", "elapsed_time": "3m 50s", "remaining_time": "8m 41s"} +{"loss": 0.5338623, "grad_norm": 0.76272155, "learning_rate": 7.748e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.150086, "rewards/chosen": 13.64999962, "rewards/rejected": 1.77734375, "rewards/accuracies": 1.0, "rewards/margins": 11.875, "logps/chosen": -474.0, "logps/rejected": -580.40002441, "logits/chosen": -1.55156255, "logits/rejected": -1.62187505, "nll_loss": 0.59062499, "epoch": 1.05263158, "global_step/max_steps": "40/114", "percentage": "35.09%", "elapsed_time": "4m 22s", "remaining_time": "8m 6s"} +{"eval_loss": 0.33032227, "eval_runtime": 2.32, "eval_samples_per_second": 1.724, 
"eval_steps_per_second": 0.431, "eval_rewards/chosen": 13.0625, "eval_rewards/rejected": 0.8984375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 12.125, "eval_logps/chosen": -160.0, "eval_logps/rejected": -1104.0, "eval_logits/chosen": -0.7734375, "eval_logits/rejected": -1.734375, "eval_nll_loss": 0.33007812, "epoch": 1.05263158, "global_step/max_steps": "40/114", "percentage": "35.09%", "elapsed_time": "4m 25s", "remaining_time": "8m 10s"} +{"loss": 0.44976807, "grad_norm": 0.48704639, "learning_rate": 7.113e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.148124, "rewards/chosen": 15.0, "rewards/rejected": 0.16914062, "rewards/accuracies": 1.0, "rewards/margins": 14.83749962, "logps/chosen": -445.0, "logps/rejected": -619.59997559, "logits/chosen": -1.45781255, "logits/rejected": -1.56562495, "nll_loss": 0.44999999, "epoch": 1.18421053, "global_step/max_steps": "45/114", "percentage": "39.47%", "elapsed_time": "5m 0s", "remaining_time": "7m 40s"} +{"loss": 0.49746704, "grad_norm": 0.50465613, "learning_rate": 6.434e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.149026, "rewards/chosen": 15.85000038, "rewards/rejected": 0.5330078, "rewards/accuracies": 1.0, "rewards/margins": 15.30000019, "logps/chosen": -494.3999939, "logps/rejected": -602.79998779, "logits/chosen": -1.40156245, "logits/rejected": -1.6484375, "nll_loss": 0.49726564, "epoch": 1.31578947, "global_step/max_steps": "50/114", "percentage": "43.86%", "elapsed_time": "5m 31s", "remaining_time": "7m 4s"} +{"loss": 0.47839966, "grad_norm": 0.24557806, "learning_rate": 5.725e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.151404, "rewards/chosen": 16.41250038, "rewards/rejected": 1.0632813, "rewards/accuracies": 1.0, "rewards/margins": 15.33749962, "logps/chosen": -472.79998779, "logps/rejected": -544.0, "logits/chosen": -1.30781245, "logits/rejected": -1.5078125, "nll_loss": 0.47812501, "epoch": 1.44736842, "global_step/max_steps": "55/114", "percentage": "48.25%", "elapsed_time": 
"5m 59s", "remaining_time": "6m 25s"} +{"loss": 0.4378418, "grad_norm": 0.17812447, "learning_rate": 5e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.152443, "rewards/chosen": 16.78750038, "rewards/rejected": 1.0347656, "rewards/accuracies": 1.0, "rewards/margins": 15.71249962, "logps/chosen": -517.20001221, "logps/rejected": -388.79998779, "logits/chosen": -1.44687498, "logits/rejected": -1.34531248, "nll_loss": 0.43789062, "epoch": 1.57894737, "global_step/max_steps": "60/114", "percentage": "52.63%", "elapsed_time": "6m 29s", "remaining_time": "5m 50s"} +{"eval_loss": 0.32299805, "eval_runtime": 2.3313, "eval_samples_per_second": 1.716, "eval_steps_per_second": 0.429, "eval_rewards/chosen": 13.875, "eval_rewards/rejected": 1.8515625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 12.0625, "eval_logps/chosen": -152.0, "eval_logps/rejected": -1088.0, "eval_logits/chosen": -0.55078125, "eval_logits/rejected": -1.609375, "eval_nll_loss": 0.3125, "epoch": 1.57894737, "global_step/max_steps": "60/114", "percentage": "52.63%", "elapsed_time": "6m 32s", "remaining_time": "5m 52s"} +{"loss": 0.42652588, "grad_norm": 0.59051798, "learning_rate": 4.275e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.152062, "rewards/chosen": 16.82500076, "rewards/rejected": 0.27421874, "rewards/accuracies": 1.0, "rewards/margins": 16.54999924, "logps/chosen": -453.0, "logps/rejected": -488.79998779, "logits/chosen": -1.33906245, "logits/rejected": -1.35625005, "nll_loss": 0.42656249, "epoch": 1.71052632, "global_step/max_steps": "65/114", "percentage": "57.02%", "elapsed_time": "7m 3s", "remaining_time": "5m 19s"} +{"loss": 0.45587769, "grad_norm": 0.53644555, "learning_rate": 3.566e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.152032, "rewards/chosen": 16.67499924, "rewards/rejected": -0.03144531, "rewards/accuracies": 1.0, "rewards/margins": 16.6875, "logps/chosen": -405.0, "logps/rejected": -501.3999939, "logits/chosen": -1.19296873, "logits/rejected": 
-1.48828125, "nll_loss": 0.45429689, "epoch": 1.84210526, "global_step/max_steps": "70/114", "percentage": "61.40%", "elapsed_time": "7m 36s", "remaining_time": "4m 47s"} +{"loss": 0.4895752, "grad_norm": 0.52485265, "learning_rate": 2.887e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.152668, "rewards/chosen": 16.97500038, "rewards/rejected": -0.95078123, "rewards/accuracies": 1.0, "rewards/margins": 17.95000076, "logps/chosen": -513.20001221, "logps/rejected": -614.40002441, "logits/chosen": -1.22968745, "logits/rejected": -1.33437502, "nll_loss": 0.48984376, "epoch": 1.97368421, "global_step/max_steps": "75/114", "percentage": "65.79%", "elapsed_time": "8m 7s", "remaining_time": "4m 13s"} +{"loss": 0.46800537, "grad_norm": 0.57556231, "learning_rate": 2.252e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.151515, "rewards/chosen": 16.54999924, "rewards/rejected": -3.07617188, "rewards/accuracies": 1.0, "rewards/margins": 19.61249924, "logps/chosen": -435.6000061, "logps/rejected": -602.0, "logits/chosen": -1.2664063, "logits/rejected": -1.2742188, "nll_loss": 0.47265625, "epoch": 2.10526316, "global_step/max_steps": "80/114", "percentage": "70.18%", "elapsed_time": "8m 44s", "remaining_time": "3m 42s"} +{"eval_loss": 0.30786133, "eval_runtime": 2.3353, "eval_samples_per_second": 1.713, "eval_steps_per_second": 0.428, "eval_rewards/chosen": 14.0625, "eval_rewards/rejected": -1.140625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.1875, "eval_logps/chosen": -150.0, "eval_logps/rejected": -1120.0, "eval_logits/chosen": -0.42382812, "eval_logits/rejected": -1.5703125, "eval_nll_loss": 0.30859375, "epoch": 2.10526316, "global_step/max_steps": "80/114", "percentage": "70.18%", "elapsed_time": "8m 46s", "remaining_time": "3m 43s"} +{"loss": 0.40935974, "grad_norm": 0.43319999, "learning_rate": 1.676e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.151429, "rewards/chosen": 16.83749962, "rewards/rejected": -0.60410154, "rewards/accuracies": 1.0, 
"rewards/margins": 17.46249962, "logps/chosen": -394.3999939, "logps/rejected": -710.79998779, "logits/chosen": -1.0460937, "logits/rejected": -1.43124998, "nll_loss": 0.40937501, "epoch": 2.23684211, "global_step/max_steps": "85/114", "percentage": "74.56%", "elapsed_time": "9m 17s", "remaining_time": "3m 10s"} +{"loss": 0.43227539, "grad_norm": 0.28446967, "learning_rate": 1.17e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.152662, "rewards/chosen": 17.97500038, "rewards/rejected": -0.76328123, "rewards/accuracies": 1.0, "rewards/margins": 18.75, "logps/chosen": -486.0, "logps/rejected": -521.20001221, "logits/chosen": -1.28281248, "logits/rejected": -1.19921875, "nll_loss": 0.43242186, "epoch": 2.36842105, "global_step/max_steps": "90/114", "percentage": "78.95%", "elapsed_time": "9m 45s", "remaining_time": "2m 36s"} +{"loss": 0.4767395, "grad_norm": 0.26464366, "learning_rate": 7.44e-06, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.152454, "rewards/chosen": 18.92499924, "rewards/rejected": -0.64999998, "rewards/accuracies": 1.0, "rewards/margins": 19.57500076, "logps/chosen": -528.40002441, "logps/rejected": -635.59997559, "logits/chosen": -1.1804688, "logits/rejected": -1.42031252, "nll_loss": 0.47695312, "epoch": 2.5, "global_step/max_steps": "95/114", "percentage": "83.33%", "elapsed_time": "10m 19s", "remaining_time": "2m 3s"} +{"loss": 0.40351562, "grad_norm": 0.35279618, "learning_rate": 4.09e-06, "memory(GiB)": 56.94, "train_speed(iter/s)": 0.153114, "rewards/chosen": 16.76250076, "rewards/rejected": -3.0625, "rewards/accuracies": 1.0, "rewards/margins": 19.82500076, "logps/chosen": -404.3999939, "logps/rejected": -512.40002441, "logits/chosen": -1.18124998, "logits/rejected": -1.48281252, "nll_loss": 0.40351564, "epoch": 2.63157895, "global_step/max_steps": "100/114", "percentage": "87.72%", "elapsed_time": "10m 49s", "remaining_time": "1m 30s"} +{"eval_loss": 0.30786133, "eval_runtime": 2.33, "eval_samples_per_second": 1.717, 
"eval_steps_per_second": 0.429, "eval_rewards/chosen": 14.125, "eval_rewards/rejected": -1.2421875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.375, "eval_logps/chosen": -150.0, "eval_logps/rejected": -1120.0, "eval_logits/chosen": -0.39257812, "eval_logits/rejected": -1.5625, "eval_nll_loss": 0.30859375, "epoch": 2.63157895, "global_step/max_steps": "100/114", "percentage": "87.72%", "elapsed_time": "10m 51s", "remaining_time": "1m 31s"} +{"loss": 0.43280029, "grad_norm": 0.73171027, "learning_rate": 1.7e-06, "memory(GiB)": 56.94, "train_speed(iter/s)": 0.152621, "rewards/chosen": 16.83749962, "rewards/rejected": -1.79140627, "rewards/accuracies": 1.0, "rewards/margins": 18.625, "logps/chosen": -434.3999939, "logps/rejected": -481.3999939, "logits/chosen": -1.2179687, "logits/rejected": -1.30624998, "nll_loss": 0.43320313, "epoch": 2.76315789, "global_step/max_steps": "105/114", "percentage": "92.11%", "elapsed_time": "11m 24s", "remaining_time": "58s"} +{"loss": 0.40185547, "grad_norm": 0.48316504, "learning_rate": 3.4e-07, "memory(GiB)": 56.94, "train_speed(iter/s)": 0.152896, "rewards/chosen": 17.6875, "rewards/rejected": -2.16171885, "rewards/accuracies": 1.0, "rewards/margins": 19.875, "logps/chosen": -453.6000061, "logps/rejected": -509.3999939, "logits/chosen": -1.13750005, "logits/rejected": -1.4609375, "nll_loss": 0.40175781, "epoch": 2.89473684, "global_step/max_steps": "110/114", "percentage": "96.49%", "elapsed_time": "11m 55s", "remaining_time": "26s"} +{"eval_loss": 0.30810547, "eval_runtime": 2.2618, "eval_samples_per_second": 1.768, "eval_steps_per_second": 0.442, "eval_rewards/chosen": 14.125, "eval_rewards/rejected": -1.2421875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.375, "eval_logps/chosen": -150.0, "eval_logps/rejected": -1120.0, "eval_logits/chosen": -0.38867188, "eval_logits/rejected": -1.5625, "eval_nll_loss": 0.30859375, "epoch": 3.0, "global_step/max_steps": "114/114", "percentage": "100.00%", 
"elapsed_time": "12m 26s", "remaining_time": "0s"} +{"train_runtime": 748.1665, "train_samples_per_second": 1.195, "train_steps_per_second": 0.152, "total_flos": 359042439708672.0, "train_loss": 0.653896, "epoch": 3.0, "global_step/max_steps": "114/114", "percentage": "100.00%", "elapsed_time": "12m 28s", "remaining_time": "0s"} +{"train_dataset": "1695.382550±899.293489, min=182.000000, max=4081.000000, size=298", "val_dataset": "1637.250000±797.581461, min=755.000000, max=2485.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 14804.4401M Params (34.4064M Trainable [0.2324%]), 0.0001M Buffers.", "last_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-114", "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/checkpoint-80", "best_metric": 0.30786133, "global_step": 114, "log_history": [{"loss": 1.3349609375, "grad_norm": 9.742470529381032, "learning_rate": 1.6666666666666667e-05, "memory(GiB)": 9.88, "train_speed(iter/s)": 0.087917, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/chosen": -704.0, "logps/rejected": -266.0, "logits/chosen": -2.15625, "logits/rejected": -1.5703125, "nll_loss": 0.64453125, "epoch": 0.02631578947368421, "step": 1}, {"loss": 2.087646484375, "grad_norm": 13.977731082873543, "learning_rate": 8.333333333333334e-05, "memory(GiB)": 19.62, "train_speed(iter/s)": 0.140484, "rewards/chosen": 0.05169677734375, "rewards/rejected": 0.027721405029296875, "rewards/accuracies": 0.1875, "rewards/margins": 0.02392578125, "logps/chosen": -719.5, "logps/rejected": -400.5, "logits/chosen": -1.611328125, "logits/rejected": -1.634765625, "nll_loss": 1.4091796875, "epoch": 0.13157894736842105, "step": 5}, {"loss": 1.9408203125, "grad_norm": 7.808526234047937, "learning_rate": 9.966191788709716e-05, 
"memory(GiB)": 46.24, "train_speed(iter/s)": 0.141462, "rewards/chosen": 1.8820312023162842, "rewards/rejected": 1.0390625, "rewards/accuracies": 0.75, "rewards/margins": 0.841015636920929, "logps/chosen": -612.7999877929688, "logps/rejected": -635.7999877929688, "logits/chosen": -1.529687523841858, "logits/rejected": -1.6765625476837158, "nll_loss": 1.470312476158142, "epoch": 0.2631578947368421, "step": 10}, {"loss": 1.054736328125, "grad_norm": 3.1525120590855167, "learning_rate": 9.829629131445342e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.151619, "rewards/chosen": 5.650000095367432, "rewards/rejected": 2.4453125, "rewards/accuracies": 0.8999999761581421, "rewards/margins": 3.207812547683716, "logps/chosen": -625.2000122070312, "logps/rejected": -439.3999938964844, "logits/chosen": -1.7109375, "logits/rejected": -1.7843749523162842, "nll_loss": 0.815625011920929, "epoch": 0.39473684210526316, "step": 15}, {"loss": 0.8697021484375, "grad_norm": 1.821418612981352, "learning_rate": 9.591080534401371e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.152154, "rewards/chosen": 8.362500190734863, "rewards/rejected": 4.043749809265137, "rewards/accuracies": 1.0, "rewards/margins": 4.324999809265137, "logps/chosen": -399.3999938964844, "logps/rejected": -522.0, "logits/chosen": -1.6437499523162842, "logits/rejected": -1.7531249523162842, "nll_loss": 0.8140624761581421, "epoch": 0.5263157894736842, "step": 20}, {"eval_loss": 0.40576171875, "eval_runtime": 2.3009, "eval_samples_per_second": 1.738, "eval_steps_per_second": 0.435, "eval_rewards/chosen": 11.125, "eval_rewards/rejected": 5.625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 5.5, "eval_logps/chosen": -180.0, "eval_logps/rejected": -1056.0, "eval_logits/chosen": -0.890625, "eval_logits/rejected": -1.78125, "eval_nll_loss": 0.3828125, "epoch": 0.5263157894736842, "step": 20}, {"loss": 0.64052734375, "grad_norm": 2.7496208811504723, "learning_rate": 9.255583362184999e-05, "memory(GiB)": 46.24, 
"train_speed(iter/s)": 0.148916, "rewards/chosen": 10.112500190734863, "rewards/rejected": 3.5999999046325684, "rewards/accuracies": 1.0, "rewards/margins": 6.525000095367432, "logps/chosen": -567.5999755859375, "logps/rejected": -600.2000122070312, "logits/chosen": -1.5984375476837158, "logits/rejected": -1.626562476158142, "nll_loss": 0.610156238079071, "epoch": 0.6578947368421053, "step": 25}, {"loss": 0.5287109375, "grad_norm": 0.8408714330433036, "learning_rate": 8.83022221559489e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.148793, "rewards/chosen": 11.162500381469727, "rewards/rejected": 2.0218749046325684, "rewards/accuracies": 1.0, "rewards/margins": 9.125, "logps/chosen": -411.5, "logps/rejected": -513.4000244140625, "logits/chosen": -1.423437476158142, "logits/rejected": -1.8078124523162842, "nll_loss": 0.528124988079071, "epoch": 0.7894736842105263, "step": 30}, {"loss": 0.579290771484375, "grad_norm": 0.7617614950729771, "learning_rate": 8.323979328069689e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.149226, "rewards/chosen": 12.475000381469727, "rewards/rejected": 2.206249952316284, "rewards/accuracies": 1.0, "rewards/margins": 10.274999618530273, "logps/chosen": -632.4000244140625, "logps/rejected": -559.2000122070312, "logits/chosen": -1.610937476158142, "logits/rejected": -1.6828124523162842, "nll_loss": 0.578906238079071, "epoch": 0.9210526315789473, "step": 35}, {"loss": 0.5338623046875, "grad_norm": 0.7627215464350648, "learning_rate": 7.74754489035403e-05, "memory(GiB)": 46.24, "train_speed(iter/s)": 0.150086, "rewards/chosen": 13.649999618530273, "rewards/rejected": 1.77734375, "rewards/accuracies": 1.0, "rewards/margins": 11.875, "logps/chosen": -474.0, "logps/rejected": -580.4000244140625, "logits/chosen": -1.5515625476837158, "logits/rejected": -1.6218750476837158, "nll_loss": 0.590624988079071, "epoch": 1.0526315789473684, "step": 40}, {"eval_loss": 0.330322265625, "eval_runtime": 2.32, "eval_samples_per_second": 1.724, 
"eval_steps_per_second": 0.431, "eval_rewards/chosen": 13.0625, "eval_rewards/rejected": 0.8984375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 12.125, "eval_logps/chosen": -160.0, "eval_logps/rejected": -1104.0, "eval_logits/chosen": -0.7734375, "eval_logits/rejected": -1.734375, "eval_nll_loss": 0.330078125, "epoch": 1.0526315789473684, "step": 40}, {"loss": 0.44976806640625, "grad_norm": 0.4870463862811011, "learning_rate": 7.113091308703498e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.148124, "rewards/chosen": 15.0, "rewards/rejected": 0.16914062201976776, "rewards/accuracies": 1.0, "rewards/margins": 14.837499618530273, "logps/chosen": -445.0, "logps/rejected": -619.5999755859375, "logits/chosen": -1.4578125476837158, "logits/rejected": -1.5656249523162842, "nll_loss": 0.44999998807907104, "epoch": 1.1842105263157894, "step": 45}, {"loss": 0.497467041015625, "grad_norm": 0.5046561338439631, "learning_rate": 6.434016163555452e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.149026, "rewards/chosen": 15.850000381469727, "rewards/rejected": 0.533007800579071, "rewards/accuracies": 1.0, "rewards/margins": 15.300000190734863, "logps/chosen": -494.3999938964844, "logps/rejected": -602.7999877929688, "logits/chosen": -1.4015624523162842, "logits/rejected": -1.6484375, "nll_loss": 0.49726563692092896, "epoch": 1.3157894736842106, "step": 50}, {"loss": 0.478399658203125, "grad_norm": 0.24557805720844658, "learning_rate": 5.724659296536233e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.151404, "rewards/chosen": 16.412500381469727, "rewards/rejected": 1.0632812976837158, "rewards/accuracies": 1.0, "rewards/margins": 15.337499618530273, "logps/chosen": -472.79998779296875, "logps/rejected": -544.0, "logits/chosen": -1.3078124523162842, "logits/rejected": -1.5078125, "nll_loss": 0.4781250059604645, "epoch": 1.4473684210526316, "step": 55}, {"loss": 0.437841796875, "grad_norm": 0.17812446885778863, "learning_rate": 5e-05, "memory(GiB)": 47.69, 
"train_speed(iter/s)": 0.152443, "rewards/chosen": 16.787500381469727, "rewards/rejected": 1.034765601158142, "rewards/accuracies": 1.0, "rewards/margins": 15.712499618530273, "logps/chosen": -517.2000122070312, "logps/rejected": -388.79998779296875, "logits/chosen": -1.446874976158142, "logits/rejected": -1.345312476158142, "nll_loss": 0.4378906190395355, "epoch": 1.5789473684210527, "step": 60}, {"eval_loss": 0.322998046875, "eval_runtime": 2.3313, "eval_samples_per_second": 1.716, "eval_steps_per_second": 0.429, "eval_rewards/chosen": 13.875, "eval_rewards/rejected": 1.8515625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 12.0625, "eval_logps/chosen": -152.0, "eval_logps/rejected": -1088.0, "eval_logits/chosen": -0.55078125, "eval_logits/rejected": -1.609375, "eval_nll_loss": 0.3125, "epoch": 1.5789473684210527, "step": 60}, {"loss": 0.42652587890625, "grad_norm": 0.5905179750476125, "learning_rate": 4.275340703463767e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.152062, "rewards/chosen": 16.825000762939453, "rewards/rejected": 0.27421873807907104, "rewards/accuracies": 1.0, "rewards/margins": 16.549999237060547, "logps/chosen": -453.0, "logps/rejected": -488.79998779296875, "logits/chosen": -1.3390624523162842, "logits/rejected": -1.3562500476837158, "nll_loss": 0.42656248807907104, "epoch": 1.7105263157894737, "step": 65}, {"loss": 0.455877685546875, "grad_norm": 0.5364455470833442, "learning_rate": 3.5659838364445505e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.152032, "rewards/chosen": 16.674999237060547, "rewards/rejected": -0.03144531324505806, "rewards/accuracies": 1.0, "rewards/margins": 16.6875, "logps/chosen": -405.0, "logps/rejected": -501.3999938964844, "logits/chosen": -1.192968726158142, "logits/rejected": -1.48828125, "nll_loss": 0.45429688692092896, "epoch": 1.8421052631578947, "step": 70}, {"loss": 0.4895751953125, "grad_norm": 0.5248526481142106, "learning_rate": 2.886908691296504e-05, "memory(GiB)": 47.69, 
"train_speed(iter/s)": 0.152668, "rewards/chosen": 16.975000381469727, "rewards/rejected": -0.9507812261581421, "rewards/accuracies": 1.0, "rewards/margins": 17.950000762939453, "logps/chosen": -513.2000122070312, "logps/rejected": -614.4000244140625, "logits/chosen": -1.2296874523162842, "logits/rejected": -1.334375023841858, "nll_loss": 0.4898437559604645, "epoch": 1.973684210526316, "step": 75}, {"loss": 0.46800537109375, "grad_norm": 0.5755623127495862, "learning_rate": 2.25245510964597e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.151515, "rewards/chosen": 16.549999237060547, "rewards/rejected": -3.076171875, "rewards/accuracies": 1.0, "rewards/margins": 19.612499237060547, "logps/chosen": -435.6000061035156, "logps/rejected": -602.0, "logits/chosen": -1.2664062976837158, "logits/rejected": -1.2742187976837158, "nll_loss": 0.47265625, "epoch": 2.1052631578947367, "step": 80}, {"eval_loss": 0.307861328125, "eval_runtime": 2.3353, "eval_samples_per_second": 1.713, "eval_steps_per_second": 0.428, "eval_rewards/chosen": 14.0625, "eval_rewards/rejected": -1.140625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.1875, "eval_logps/chosen": -150.0, "eval_logps/rejected": -1120.0, "eval_logits/chosen": -0.423828125, "eval_logits/rejected": -1.5703125, "eval_nll_loss": 0.30859375, "epoch": 2.1052631578947367, "step": 80}, {"loss": 0.4093597412109375, "grad_norm": 0.433199990811398, "learning_rate": 1.6760206719303105e-05, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.151429, "rewards/chosen": 16.837499618530273, "rewards/rejected": -0.6041015386581421, "rewards/accuracies": 1.0, "rewards/margins": 17.462499618530273, "logps/chosen": -394.3999938964844, "logps/rejected": -710.7999877929688, "logits/chosen": -1.0460937023162842, "logits/rejected": -1.431249976158142, "nll_loss": 0.40937501192092896, "epoch": 2.236842105263158, "step": 85}, {"loss": 0.432275390625, "grad_norm": 0.2844696685876377, "learning_rate": 1.1697777844051105e-05, "memory(GiB)": 
47.69, "train_speed(iter/s)": 0.152662, "rewards/chosen": 17.975000381469727, "rewards/rejected": -0.7632812261581421, "rewards/accuracies": 1.0, "rewards/margins": 18.75, "logps/chosen": -486.0, "logps/rejected": -521.2000122070312, "logits/chosen": -1.282812476158142, "logits/rejected": -1.19921875, "nll_loss": 0.43242186307907104, "epoch": 2.3684210526315788, "step": 90}, {"loss": 0.476739501953125, "grad_norm": 0.26464365821513314, "learning_rate": 7.444166378150013e-06, "memory(GiB)": 47.69, "train_speed(iter/s)": 0.152454, "rewards/chosen": 18.924999237060547, "rewards/rejected": -0.6499999761581421, "rewards/accuracies": 1.0, "rewards/margins": 19.575000762939453, "logps/chosen": -528.4000244140625, "logps/rejected": -635.5999755859375, "logits/chosen": -1.1804687976837158, "logits/rejected": -1.420312523841858, "nll_loss": 0.4769531190395355, "epoch": 2.5, "step": 95}, {"loss": 0.403515625, "grad_norm": 0.35279617616415065, "learning_rate": 4.089194655986306e-06, "memory(GiB)": 56.94, "train_speed(iter/s)": 0.153114, "rewards/chosen": 16.762500762939453, "rewards/rejected": -3.0625, "rewards/accuracies": 1.0, "rewards/margins": 19.825000762939453, "logps/chosen": -404.3999938964844, "logps/rejected": -512.4000244140625, "logits/chosen": -1.181249976158142, "logits/rejected": -1.482812523841858, "nll_loss": 0.40351563692092896, "epoch": 2.6315789473684212, "step": 100}, {"eval_loss": 0.307861328125, "eval_runtime": 2.33, "eval_samples_per_second": 1.717, "eval_steps_per_second": 0.429, "eval_rewards/chosen": 14.125, "eval_rewards/rejected": -1.2421875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.375, "eval_logps/chosen": -150.0, "eval_logps/rejected": -1120.0, "eval_logits/chosen": -0.392578125, "eval_logits/rejected": -1.5625, "eval_nll_loss": 0.30859375, "epoch": 2.6315789473684212, "step": 100}, {"loss": 0.43280029296875, "grad_norm": 0.7317102695797911, "learning_rate": 1.70370868554659e-06, "memory(GiB)": 56.94, "train_speed(iter/s)": 
0.152621, "rewards/chosen": 16.837499618530273, "rewards/rejected": -1.791406273841858, "rewards/accuracies": 1.0, "rewards/margins": 18.625, "logps/chosen": -434.3999938964844, "logps/rejected": -481.3999938964844, "logits/chosen": -1.2179687023162842, "logits/rejected": -1.306249976158142, "nll_loss": 0.4332031309604645, "epoch": 2.763157894736842, "step": 105}, {"loss": 0.40185546875, "grad_norm": 0.4831650405036417, "learning_rate": 3.380821129028489e-07, "memory(GiB)": 56.94, "train_speed(iter/s)": 0.152896, "rewards/chosen": 17.6875, "rewards/rejected": -2.1617188453674316, "rewards/accuracies": 1.0, "rewards/margins": 19.875, "logps/chosen": -453.6000061035156, "logps/rejected": -509.3999938964844, "logits/chosen": -1.1375000476837158, "logits/rejected": -1.4609375, "nll_loss": 0.4017578065395355, "epoch": 2.8947368421052633, "step": 110}, {"eval_loss": 0.30810546875, "eval_runtime": 2.2618, "eval_samples_per_second": 1.768, "eval_steps_per_second": 0.442, "eval_rewards/chosen": 14.125, "eval_rewards/rejected": -1.2421875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.375, "eval_logps/chosen": -150.0, "eval_logps/rejected": -1120.0, "eval_logits/chosen": -0.388671875, "eval_logits/rejected": -1.5625, "eval_nll_loss": 0.30859375, "epoch": 3.0, "step": 114}, {"train_runtime": 748.1665, "train_samples_per_second": 1.195, "train_steps_per_second": 0.152, "total_flos": 359042439708672.0, "train_loss": 0.6538959971645422, "epoch": 3.0, "step": 114}], "memory": 56.939453125} diff --git a/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs/events.out.tfevents.1739307211.kml-task-540432-record-10109969-prod-worker-0.26822.0 b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs/events.out.tfevents.1739307211.kml-task-540432-record-10109969-prod-worker-0.26822.0 new file mode 100644 index 0000000000000000000000000000000000000000..2056aa5e79643551a8df711ba971d4ab160b9ade --- /dev/null +++ 
b/deepseek-r1-14b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-205058/runs/events.out.tfevents.1739307211.kml-task-540432-record-10109969-prod-worker-0.26822.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08218e3afe58311f9bc2af02f3f7e6dd7ff589f3668219400227797c3821489a +size 32191 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/args.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/args.json new file mode 100644 index 0000000000000000000000000000000000000000..f91b0af53496ef06a59209f3d8bb439838e6bc25 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + 
"temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": 
false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + 
"length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": 
false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + 
"cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', 
model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], 
dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, 
use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/README.md b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/adapter_config.json 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..06c6dc009569b1d8731cabeb7f823201b276f6fd --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "q_proj", + "o_proj", + "up_proj", + "v_proj", + "down_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/adapter_model.safetensors b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..38ad1b636da12436e1a8228c5e6875da6d73b936 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4b12a4bdd0be3dbe2c4ae0177d9133660f20b766fb2ecffbcc9e5da88722656 +size 134337704 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/additional_config.json 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/args.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..f91b0af53496ef06a59209f3d8bb439838e6bc25 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": 
null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 
100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", 
+ "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], 
+ "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + 
"reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', 
model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', 
half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], 
ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, 
train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b33e69fd03dd48853ffb39e9d37436c150963e8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:746d5215b1ed6e55ec0f343e78ac181c2deb2f1bc8adb9e5666add85fd978205 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6699e6569bdaac6a651c7e8a372c873f1b5708ba --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eac749f024880bdbc6439dfb40cd92863c1d6de731fd0e2156ccb0adcb99c57f +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..9139e8a22acb06cf8b73feb0755ad138d9511326 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:794a0aa25d5d0a7a459a24554f5c97c39bf2ef061a94b6b6ef5b1dad67549f50 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f41f672a681680f57692d020cb2afb7c4afb5ea3 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d0feafd931c85964000e27b91c1211dd81272d65ff44f26b7755c8216d4c6b +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d60e89a9da863316314db406ec4007d548cf878f --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f033adbcefd8a7cf9a471055828aa08c3e2f1f575238f6e5f60392c9f497011 +size 100667312 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a582e92a0f28d972b651e4ebc1f90bbc9fbfb78 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6c6a5c46ef6a56ce66a36ef530873b94636ef6ed5a1bd2bc20a4517bfb4d477 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e283d04ad370f9b21801152c7c0f5776dd45d734 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96f9093c57035a35a3ded4c3a6c21d9b6d358e330f7136413d067a6835eaa518 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..239b47bf2027002135e26f5b1b5a144ad12c42a3 --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ccb9a8af6132e2e94f263f8dac952938a426670917a2922fe999c2c6da07c06 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..568bd45d5b24b4795d70264d7269368bd244666a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bcd524627b2dad5b16b0a8edb70803106664642a8284c27bbe2dbecbdbed081 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c873e3cc664878fd567a173af654bb6edc01082c --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c6ad868c9b25cdcce1c572d1af1478c81bb85b133233d84e8fa210ae2cacbcb +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2310c72f6b7216381b678c6f8059c5d6bf503a1f --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09658b5a6f4b67d5f95d493b56b7fea6c46219a443fd636e18b9b4f1861bffa7 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bd1e6e63a6311e5cad0275b9d98362f2aa1648e --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b749026718e43177a907e5c6de2f7f5a629128e982f84b67d4143a3a3070387a +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c724345add8910891a1ee1da02a5c9218b8ec91 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2f67d96c86e28092b802f396768141688b61089904cb1315359c2432fb4b76 +size 886254 diff 
--git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b086667f2a457409146668d50c1a293b6eda5f5 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c578445d87812217a3dcaabedcd37902ba6905812ae38dde3ee9642e5be1939 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53ec2782ab6ef279988ab2245a2b1f3d17e17fac --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daa35a6dd309918b376938035b2611b50efee2bbdafd603637075b346da38c86 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..936e00196f3e863de78936cafae49fba3bf50657 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76ad7aa6e9d6acdcab0184b142efaf843b638347e89b34d1ef8a0979ecb9332 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/latest b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_0.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b346349ce12dd5a17d4b91ed2a5722bb52550950 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_1.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f3c6994456cb8d0592a5375d99503c8924b1c4 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_2.pth 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..be044f6ceeed587d30e80c2f72d5aa19fdc9947b --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_3.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc825249656a9b858782542bd3f4386250f1dfe0 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_4.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d30f52a44be563c152ae09db6ae934da6da0d3ed --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_5.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8715d27ab23ae545d58039cf949cc44ecc1da5e --- 
/dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_6.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed791b6ef76eadf0b0c55a5733411771e2ae027 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_7.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..800c3bbbc5edf7db01a8316069d439c5fb8d8c30 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/scheduler.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e201402bb36891e48e2b7110304ad87df61a6070 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:91b40f5e8ba2f299f4eda41d6964ef1f313f53d1f8f687ebd6938ce3242fb4c3 +size 1064 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/trainer_state.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33abe422c6051461ea7770298bb62b3e160eae58 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.38525391, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20", + "epoch": 0.5263157894736842, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 4.865264262311847, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.6015625, + "logps/chosen": -664.0, + "logps/rejected": -243.0, + "loss": 1.279296875, + "memory(GiB)": 14.31, + "nll_loss": 0.58984375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.041794 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 8.298379383762063, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.572265625, + "logits/rejected": -1.572265625, + "logps/chosen": -676.25, + "logps/rejected": -368.25, + "loss": 1.877197265625, + "memory(GiB)": 24.25, + "nll_loss": 1.181640625, + "rewards/accuracies": 0.28125, + "rewards/chosen": 0.02191162109375, + "rewards/margins": 0.01955413818359375, + "rewards/rejected": 0.0023345947265625, + "step": 5, + "train_speed(iter/s)": 0.072192 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 3.9576533365167594, + 
"learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.5078125, + "logits/rejected": -1.564062476158142, + "logps/chosen": -567.5999755859375, + "logps/rejected": -591.5999755859375, + "loss": 1.73408203125, + "memory(GiB)": 56.39, + "nll_loss": 1.2625000476837158, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.364843726158142, + "rewards/margins": 0.669543445110321, + "rewards/rejected": 0.6944335699081421, + "step": 10, + "train_speed(iter/s)": 0.074103 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.457193389734318, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.5906250476837158, + "logits/rejected": -1.6687500476837158, + "logps/chosen": -586.0, + "logps/rejected": -405.6000061035156, + "loss": 0.949169921875, + "memory(GiB)": 56.39, + "nll_loss": 0.7007812261581421, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.846875190734863, + "rewards/margins": 2.987499952316284, + "rewards/rejected": 1.8585937023162842, + "step": 15, + "train_speed(iter/s)": 0.079553 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 2.1542856270963138, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6359374523162842, + "logits/rejected": -1.609375, + "logps/chosen": -369.0, + "logps/rejected": -490.0, + "loss": 0.7104736328125, + "memory(GiB)": 56.39, + "nll_loss": 0.6058593988418579, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 7.431250095367432, + "rewards/margins": 3.4906249046325684, + "rewards/rejected": 3.9375, + "step": 20, + "train_speed(iter/s)": 0.08046 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -1.6328125, + "eval_logits/rejected": -1.3046875, + "eval_logps/chosen": -148.0, + "eval_logps/rejected": -1016.0, + "eval_loss": 0.38525390625, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.3125, + "eval_rewards/margins": 3.90625, + "eval_rewards/rejected": 5.40625, + "eval_runtime": 4.451, + 
"eval_samples_per_second": 0.899, + "eval_steps_per_second": 0.225, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 80894479040512.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/training_args.bin b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9cef6700c14123df9ea498234fba18f213c730f4 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9afc8c7f993c3ac41570a78a0c830e80bdd2cd718c8174b0806a3d3df5b88553 +size 9016 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/zero_to_fp32.py b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
def parse_model_states(files):
    """
    Load every model-states file and keep only the metadata needed to rebuild the fp32 weights.

    Args:
        - ``files``: paths of the ``*_model_states.pt`` files to load

    Returns:
        - list with one ``zero_model_state`` per entry of ``files``, in the same order

    Raises:
        - ``ValueError``: a loaded file has no ``BUFFER_NAMES`` entry, i.e. it is not a model state checkpoint
    """
    zero_model_states = []
    for file in files:
        # NOTE(review): weights_only=False unpickles arbitrary objects - only run this on trusted checkpoints
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional in the checkpoint; both entries default to None when absent
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        # handle shared params: stored as [name, other_name] pairs
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: forwarded unchanged to the stage-specific reconstruction function

    Returns:
        - the state_dict produced by the zero2 (stage <= 2) or zero3 (stage == 3) reconstruction function;
          implicitly ``None`` if neither branch matches

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    # the optimizer-state files carry the flat fp32 groups plus the zero stage and partition count
    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    # the model-state files carry the metadata read below (e.g. the deepspeed version)
    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # dispatch on the stage; there is no else branch, so any other stage value returns None
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict``.

    For every param group the per-rank flat partitions are concatenated into one vector, then
    consecutive slices are carved out of it following the order and shapes recorded in ``param_shapes``.

    Args:
        - ``state_dict``: dict updated in place with ``name -> tensor``; tensors are views of the merged vector
        - ``world_size``: number of partitions; also defines the ``2 * world_size`` alignment used by the sanity check
        - ``fp32_flat_groups``: ``fp32_flat_groups[rank][group]`` is the flat partition held by ``rank`` for ``group``
        - ``zero_model_states``: parsed model states; only ``zero_model_states[0].param_shapes`` is read

    Raises:
        - ``ValueError``: after alignment, the consumed element count of a group differs from the available count
    """
    param_shapes = zero_model_states[0].param_shapes

    def numel_of(shape):
        # a shape is either an object exposing numel() or a plain sequence of ints
        return shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        # exact integer round-up to the next multiple of align_to (no float round trip)
        return align_to * ((x + align_to - 1) // align_to)

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        # use the same guarded numel as the main loop so plain-sequence shapes don't crash debug mode
        wanted_numel = sum([sum(numel_of(shape) for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = numel_of(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Compute how a parameter of ``unpartitioned_numel`` elements is split evenly across ``world_size`` ranks.

    Args:
        - ``unpartitioned_numel``: element count of the full parameter
        - ``world_size``: number of partitions

    Returns:
        - ``(partitioned_numel, padding_numel)``: elements per partition (rounded up), and the number of
          padding elements needed to make the total a multiple of ``world_size``
    """
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    # exact integer ceil-division: math.ceil(a / b) goes through a float and rounds for very large a
    partitioned_numel = (unpartitioned_numel + padding_numel) // world_size
    return partitioned_numel, padding_numel
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.

    The real tensor is only materialized by ``contiguous()``; until then the object just records
    where the parameter's slice sits inside each rank's flat groups.

    Args:
        - ``flat_groups``: ``flat_groups[rank][group]`` is a flat tensor
        - ``flat_groups_offset``: cumulative start offsets of the groups; ``flat_groups_offset[g]`` and
          ``flat_groups_offset[g + 1]`` bound group ``g``
        - ``offset``: start of this parameter's slice, counted across the concatenated groups of one rank
        - ``partitioned_numel``: number of elements of this parameter held by each rank
        - ``shape``: shape of the full parameter; must expose ``numel()``
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Returns:
            - tensor of shape ``self.shape`` built from every rank's slice, with trailing padding dropped

        Raises:
            - ``ValueError``: the slice ``[offset, offset + partitioned_numel)`` is not covered by the groups
        """
        if self.partitioned_numel == 0:
            # nothing to gather; the group search below has no non-empty range to match
            return torch.empty(self.shape, dtype=self.dtype)

        end_idx = self.offset + self.partitioned_numel

        # The group layout is the same for every rank, so locate the pieces once.
        # Each entry is (group_id, start inside group, end inside group).
        pieces = []
        covered = 0
        for group_id in range(len(self.flat_groups_offset) - 1):
            group_start = self.flat_groups_offset[group_id]
            group_end = self.flat_groups_offset[group_id + 1]
            # clamp to the group: for a follow-up group the slice starts at 0, not at a negative index
            lo = max(self.offset, group_start)
            hi = min(end_idx, group_end)
            if lo < hi:
                pieces.append((group_id, lo - group_start, hi - group_start))
                covered += hi - lo
        if covered != self.partitioned_numel:
            raise ValueError(f"slice [{self.offset}, {end_idx}) is not covered by the flat groups "
                             f"(found {covered} of {self.partitioned_numel} elements)")

        # collect weights from related group/groups, rank by rank
        pad_flat_param_chunks = []
        for flat_groups_at_rank_i in self.flat_groups:
            for group_id, start_offset, end_offset in pieces:
                pad_flat_param_chunks.append(flat_groups_at_rank_i[group_id][start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                             tag=None,
                                             exclude_frozen_parameters=False,
                                             lazy_mode=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
          Convert a pseudo tensor to a torch tensor with ``.contiguous()``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` is not given and there is no ``latest`` file in ``checkpoint_dir``
        - ``FileNotFoundError``: the ``checkpoint_dir/tag`` directory doesn't exist

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint. Or you can load state_dict in lazy mode ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
        for name, lazy_tensor in state_dict.items():
            tensor = lazy_tensor.contiguous()  # to cpu
            print(name, tensor)
            # del tensor to release memory if it is no longer in use
    """
    if tag is None:
        # fall back to the tag written in the 'latest' file of the checkpoint folder
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
    if lazy_mode:
        # hand back the reconstructed entries as-is; the caller materializes them with .contiguous()
        return state_dict
    else:
        return to_torch_tensor(state_dict)
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and convert the checkpoint found in
    # ``checkpoint_dir`` into fp32 weight file(s) written to ``output_dir``.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    # NOTE: adjacent string literals are concatenated verbatim, so every fragment that is
    # continued on the next line must end with a space.
    parser.add_argument("output_dir",
                        type=str,
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    parser.add_argument(
        "--safe_serialization",
        default=False,
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # module-level switch read by the reconstruction helpers to enable verbose prints
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
9.966e-05, "memory(GiB)": 56.39, "train_speed(iter/s)": 0.074103, "rewards/chosen": 1.36484373, "rewards/rejected": 0.69443357, "rewards/accuracies": 0.77499998, "rewards/margins": 0.66954345, "logps/chosen": -567.59997559, "logps/rejected": -591.59997559, "logits/chosen": -1.5078125, "logits/rejected": -1.56406248, "nll_loss": 1.26250005, "epoch": 0.26315789, "global_step/max_steps": "10/114", "percentage": "8.77%", "elapsed_time": "2m 3s", "remaining_time": "21m 24s"} +{"loss": 0.94916992, "grad_norm": 1.45719339, "learning_rate": 9.83e-05, "memory(GiB)": 56.39, "train_speed(iter/s)": 0.079553, "rewards/chosen": 4.84687519, "rewards/rejected": 1.8585937, "rewards/accuracies": 0.89999998, "rewards/margins": 2.98749995, "logps/chosen": -586.0, "logps/rejected": -405.6000061, "logits/chosen": -1.59062505, "logits/rejected": -1.66875005, "nll_loss": 0.70078123, "epoch": 0.39473684, "global_step/max_steps": "15/114", "percentage": "13.16%", "elapsed_time": "2m 57s", "remaining_time": "19m 28s"} +{"loss": 0.71047363, "grad_norm": 2.15428563, "learning_rate": 9.591e-05, "memory(GiB)": 56.39, "train_speed(iter/s)": 0.08046, "rewards/chosen": 7.4312501, "rewards/rejected": 3.9375, "rewards/accuracies": 0.97500002, "rewards/margins": 3.4906249, "logps/chosen": -369.0, "logps/rejected": -490.0, "logits/chosen": -1.63593745, "logits/rejected": -1.609375, "nll_loss": 0.6058594, "epoch": 0.52631579, "global_step/max_steps": "20/114", "percentage": "17.54%", "elapsed_time": "3m 57s", "remaining_time": "18m 34s"} +{"eval_loss": 0.38525391, "eval_runtime": 4.451, "eval_samples_per_second": 0.899, "eval_steps_per_second": 0.225, "eval_rewards/chosen": 9.3125, "eval_rewards/rejected": 5.40625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 3.90625, "eval_logps/chosen": -148.0, "eval_logps/rejected": -1016.0, "eval_logits/chosen": -1.6328125, "eval_logits/rejected": -1.3046875, "eval_nll_loss": 0.33203125, "epoch": 0.52631579, "global_step/max_steps": "20/114", 
"percentage": "17.54%", "elapsed_time": "4m 1s", "remaining_time": "18m 55s"} +{"loss": 0.53587646, "grad_norm": 1.02004692, "learning_rate": 9.256e-05, "memory(GiB)": 56.39, "train_speed(iter/s)": 0.079472, "rewards/chosen": 8.45625019, "rewards/rejected": 2.3499999, "rewards/accuracies": 1.0, "rewards/margins": 6.0999999, "logps/chosen": -537.59997559, "logps/rejected": -573.0, "logits/chosen": -1.62656248, "logits/rejected": -1.57500005, "nll_loss": 0.5113281, "epoch": 0.65789474, "global_step/max_steps": "25/114", "percentage": "21.93%", "elapsed_time": "5m 3s", "remaining_time": "17m 59s"} +{"loss": 0.46938477, "grad_norm": 0.87929473, "learning_rate": 8.83e-05, "memory(GiB)": 56.39, "train_speed(iter/s)": 0.079738, "rewards/chosen": 9.01249981, "rewards/rejected": 0.2453125, "rewards/accuracies": 1.0, "rewards/margins": 8.76875019, "logps/chosen": -395.6000061, "logps/rejected": -497.0, "logits/chosen": -1.640625, "logits/rejected": -1.67656255, "nll_loss": 0.46875, "epoch": 0.78947368, "global_step/max_steps": "30/114", "percentage": "26.32%", "elapsed_time": "6m 4s", "remaining_time": "17m 1s"} +{"loss": 0.53029175, "grad_norm": 0.40053357, "learning_rate": 8.324e-05, "memory(GiB)": 56.39, "train_speed(iter/s)": 0.080046, "rewards/chosen": 9.64999962, "rewards/rejected": -0.734375, "rewards/accuracies": 1.0, "rewards/margins": 10.38749981, "logps/chosen": -608.59997559, "logps/rejected": -541.20001221, "logits/chosen": -1.66562498, "logits/rejected": -1.68124998, "nll_loss": 0.52968752, "epoch": 0.92105263, "global_step/max_steps": "35/114", "percentage": "30.70%", "elapsed_time": "7m 5s", "remaining_time": "16m 1s"} diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/runs/events.out.tfevents.1739305055.kml-task-540432-record-10109969-prod-worker-0.22315.0 b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/runs/events.out.tfevents.1739305055.kml-task-540432-record-10109969-prod-worker-0.22315.0 new file 
mode 100644 index 0000000000000000000000000000000000000000..6d9163b38b503d9e11817720a4874035f18725a8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-201638/runs/events.out.tfevents.1739305055.kml-task-540432-record-10109969-prod-worker-0.22315.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd93a7f025d27f6ba53bd8ee7b30778f446e70c0202f8a6ccec219645d978322 +size 15263 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/args.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/args.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe933211dec07797d29e82d5866d38722980d87 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + 
"bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + 
"save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + 
}, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 
32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + 
"reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, 
hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', 
bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, 
ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, 
acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/README.md b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/adapter_config.json 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a869ebfcab317c3d514f62adbd5d078fd257a39a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/adapter_model.safetensors b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..14d4fc6a92238ee9233319386a0325855248b3d2 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a842642733bdfd23c7a69fc2c11f6587f2d836d3b37d3349dbeebe52f004fac +size 134337704 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/additional_config.json 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/args.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/args.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe933211dec07797d29e82d5866d38722980d87 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + 
"hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + 
"save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + 
"train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + 
"target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + 
"reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', 
hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, 
bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, 
length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, 
train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bae5ffedc48349d65a445b0bd38ad562aeddd8b --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21c63e39a57e26211bed233b3d8ceedc5c6287bc859a9f407860e7309efc67dd +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66ac6be4439dc3929729f248f8f1848fb6242ef7 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b379ae9cbbe69974e53350db7eee1515407944f1dfdfde3dffd782abbe0f366 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..a5c4634b98d37ad2d91d6b8e614c75652c4c259e --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e7f43f114f52d285c778e9faa8d66ccecf5cba87d00ce9d0d8bd70c0415897b +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99b9387e66e2d13e7c30f9cb003bc30c9ed188ed --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afe28a12dc05683dbe92fd8f3af357d3e1b59857335c8316eac67852cd929a8b +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7be561d9385236e78cb985b2624f5c7efe6fb52a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:689b71a2029ce5b6b1801508253bb3a8b5af3ecdf8218dd3f9191abb8b7de3f7 +size 100667312 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35703a3997c1285ad70ef81d1ff8240405be81cb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:780324c4bd82d1ab26fa0332589208cca2a74a7a3034f4cc9ddebf4b4f54491d +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1676763798b72f030667ca0df8dc0ecef75ca346 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00586d3b74e38476e1ffcc4ef298c8688e74399ccf36b17550dea2e69b5692b0 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f07367d2314717dbfe1c2bb763c01345b913c54 --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d21b001c4a764a716eee224058d1db2ceee94bd71aac5a65b2887831ca70956 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c7ae4432113ad4b11cf31e67d4c4a37f9f89937 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddfc67b3a517ef961c3b5705cc7acf2914eb49d3c4dc2992884fa09449846b3e +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4be062e0eceb3e9ac20c1d41d4b0cf14db42ac9 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a4df242d6635400e57556b99edee63cfff3a968ca0a2755b2ceef1f9012b05e +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b337383b9f00724e70df2259eb88dce09a19e1c --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00564d9858ab14b7a140837c454f065a2420dc403617f17cb8a2e5118cd86eaa +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b9ba5870d8bb67867bda6c28f98a0ed4dbc9ca0 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:329d6ca2ce6026f0136c01384891858b0084fc96c796d6096100f085374bb838 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92251b57cb07b3bbffb2cf4e89796f689851c1bb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee250f073179786701924268a7356463dcb5ab4bb216a0319d44c81b37259f3 
+size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b940ead0174e2c9a9d429788757291a89c76828b --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:642dc60def74ca84e35dfdf6296aa8289a65944e65a892aff15bf6e6547a71e0 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ad47e5822c97d26058f7cd02f16ce9d56ce270f --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e67b4b0b03435aab670516d7b779bd2586af114df30a200bc2fd9fa37b993aec +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..508375eb6c9c97918bb0d80d1c9da8d7c15173c1 --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbfc7e2779ddf3c8f005358467f333da4c7d239138fd358ac7f6095abfcd63d1 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/latest b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..744ae7dbad571b6f37ec6c7066549494261bb59e --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/latest @@ -0,0 +1 @@ +global_step100 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_0.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a74f25da28f01a2e6b66587824ee5f5cc9be737 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ee195ebde9bf012f945f068f133e7fe22fef5450c496607e3ef11cc2034a186 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_1.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f44ddc47315653477728c971b4ea191a3df8b92c --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0fe1a3315d60b197207c5cb249d0ce4f9ce6d7585e696276d9ffbcb5379893 +size 15984 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_2.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..04636b9eca6484a4339eaa1e3acdf15d42d493b3 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c5bd6eae04542162b3e94245555bd81312524066bc01d0ebbfc4fd8554240e +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_3.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..05435e407541728c3159054a4beb6705039a8ddf --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b74942c68b00d657cfce186b0eeb4aa8f52efa04b114803b605fee8de45972 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_4.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..94fdf5f2c3e5df27424e6482bf52255531147a23 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd66dd2ba958fc9929441817d8154abbd929c0aa9cd66ff3171965bdaaf5d78 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_5.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_5.pth 
new file mode 100644 index 0000000000000000000000000000000000000000..da6e37fc011d97a1512e1e746bdd410a738c018a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89eeedefdd62514d0130acc330a5c08e9774c95d38c60997905cfd65fc54b710 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_6.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..751fd85c617e15dee9713bc0f0c533af5bd18c8e --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ced939100082608f57561a10e1888e69210c80675068db530c5815889910e +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_7.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4aacf54fa8285b7e199a7cd62f1ee3d8b9beb5e5 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d8d6ee244d99525e7004ae3f02d44ae63082d81fbbab7306f641ac6aeeb736f +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/scheduler.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2a1fb08c48e9d34df783eb19e7c9d1caf0ed386 --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec37c3a15b8d061312402391f2fddb52d623a1416d6d2879a30f184450d844f +size 1064 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/trainer_state.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9386a76b7e21779cc4cf85f0ed16a43f30cc48d9 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.28149414, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100", + "epoch": 2.6315789473684212, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 4.865237153661176, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.6015625, + "logps/chosen": -664.0, + "logps/rejected": -243.0, + "loss": 1.279296875, + "memory(GiB)": 14.31, + "nll_loss": 0.58984375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.060374 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 8.264469185177074, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.572265625, + "logits/rejected": -1.572265625, + "logps/chosen": -676.25, + "logps/rejected": -368.0, + "loss": 1.886962890625, + "memory(GiB)": 24.25, + "nll_loss": 1.1826171875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0015869140625, + "rewards/margins": -0.00927734375, + 
"rewards/rejected": 0.01096343994140625, + "step": 5, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 4.869686740353647, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.506250023841858, + "logits/rejected": -1.564062476158142, + "logps/chosen": -566.7999877929688, + "logps/rejected": -591.7999877929688, + "loss": 1.74365234375, + "memory(GiB)": 56.27, + "nll_loss": 1.2609374523162842, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.384374976158142, + "rewards/margins": 0.6645263433456421, + "rewards/rejected": 0.719042956829071, + "step": 10, + "train_speed(iter/s)": 0.078572 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.219941119836901, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.592187523841858, + "logits/rejected": -1.670312523841858, + "logps/chosen": -587.2000122070312, + "logps/rejected": -406.6000061035156, + "loss": 0.95322265625, + "memory(GiB)": 56.27, + "nll_loss": 0.7054687738418579, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.762499809265137, + "rewards/margins": 3.004687547683716, + "rewards/rejected": 1.764062523841858, + "step": 15, + "train_speed(iter/s)": 0.08295 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 2.2412220923276367, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6375000476837158, + "logits/rejected": -1.610937476158142, + "logps/chosen": -369.0, + "logps/rejected": -490.0, + "loss": 0.7086181640625, + "memory(GiB)": 56.27, + "nll_loss": 0.616406261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.375, + "rewards/margins": 3.456249952316284, + "rewards/rejected": 3.924999952316284, + "step": 20, + "train_speed(iter/s)": 0.083066 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.3046875, + "eval_logps/chosen": -147.0, + "eval_logps/rejected": -1020.0, + "eval_loss": 0.38232421875, + "eval_nll_loss": 0.33203125, + 
"eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.3125, + "eval_rewards/margins": 4.15625, + "eval_rewards/rejected": 5.15625, + "eval_runtime": 4.3657, + "eval_samples_per_second": 0.916, + "eval_steps_per_second": 0.229, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 1.097155209357345, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -1.626562476158142, + "logits/rejected": -1.576562523841858, + "logps/chosen": -538.7999877929688, + "logps/rejected": -572.2000122070312, + "loss": 0.53927001953125, + "memory(GiB)": 56.27, + "nll_loss": 0.513671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.412500381469727, + "rewards/margins": 6.025000095367432, + "rewards/rejected": 2.371875047683716, + "step": 25, + "train_speed(iter/s)": 0.08148 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8998878137633866, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.681249976158142, + "logps/chosen": -396.3999938964844, + "logps/rejected": -498.3999938964844, + "loss": 0.46839599609375, + "memory(GiB)": 56.27, + "nll_loss": 0.46757811307907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.949999809265137, + "rewards/margins": 8.787500381469727, + "rewards/rejected": 0.15253905951976776, + "step": 30, + "train_speed(iter/s)": 0.081373 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.4025215648853001, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -1.665624976158142, + "logits/rejected": -1.6828124523162842, + "logps/chosen": -610.2000122070312, + "logps/rejected": -542.7999877929688, + "loss": 0.5325042724609375, + "memory(GiB)": 56.27, + "nll_loss": 0.532031238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.512499809265137, + "rewards/margins": 10.462499618530273, + "rewards/rejected": -0.964062511920929, + "step": 35, + "train_speed(iter/s)": 0.081424 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.49646039527754704, + 
"learning_rate": 7.74754489035403e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.703125, + "logps/chosen": -476.79998779296875, + "logps/rejected": -578.0, + "loss": 0.5092529296875, + "memory(GiB)": 56.27, + "nll_loss": 0.5648437738418579, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.912500381469727, + "rewards/margins": 11.987500190734863, + "rewards/rejected": -2.090625047683716, + "step": 40, + "train_speed(iter/s)": 0.081711 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.296875, + "eval_logps/chosen": -136.0, + "eval_logps/rejected": -1080.0, + "eval_loss": 0.294677734375, + "eval_nll_loss": 0.294921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.5, + "eval_rewards/margins": 11.1875, + "eval_rewards/rejected": -0.69921875, + "eval_runtime": 4.3578, + "eval_samples_per_second": 0.918, + "eval_steps_per_second": 0.229, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.4969045495850996, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -1.6328125, + "logits/rejected": -1.6015625, + "logps/chosen": -444.6000061035156, + "logps/rejected": -614.0, + "loss": 0.4313232421875, + "memory(GiB)": 57.72, + "nll_loss": 0.43085938692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.699999809265137, + "rewards/margins": 13.237500190734863, + "rewards/rejected": -2.549999952316284, + "step": 45, + "train_speed(iter/s)": 0.080571 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.2582495580909455, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7218749523162842, + "logps/chosen": -496.0, + "logps/rejected": -584.4000244140625, + "loss": 0.4834228515625, + "memory(GiB)": 57.72, + "nll_loss": 0.4828124940395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 13.399999618530273, + "rewards/rejected": -2.135937452316284, + "step": 50, + "train_speed(iter/s)": 
0.080901 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.1378301574581502, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -1.584375023841858, + "logits/rejected": -1.6453125476837158, + "logps/chosen": -475.6000061035156, + "logps/rejected": -534.4000244140625, + "loss": 0.46617431640625, + "memory(GiB)": 57.72, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.800000190734863, + "rewards/margins": 14.0, + "rewards/rejected": -2.214062452316284, + "step": 55, + "train_speed(iter/s)": 0.08215 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.12374674083229541, + "learning_rate": 5e-05, + "logits/chosen": -1.5671875476837158, + "logits/rejected": -1.6515624523162842, + "logps/chosen": -515.2000122070312, + "logps/rejected": -377.6000061035156, + "loss": 0.42254638671875, + "memory(GiB)": 57.72, + "nll_loss": 0.42304688692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.524999618530273, + "rewards/margins": 14.125, + "rewards/rejected": -1.598046898841858, + "step": 60, + "train_speed(iter/s)": 0.082614 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -1.6328125, + "eval_logits/rejected": -1.2890625, + "eval_logps/chosen": -131.0, + "eval_logps/rejected": -1072.0, + "eval_loss": 0.285400390625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.875, + "eval_rewards/margins": 10.6875, + "eval_rewards/rejected": 0.2001953125, + "eval_runtime": 4.4113, + "eval_samples_per_second": 0.907, + "eval_steps_per_second": 0.227, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.25020620326140985, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -1.6765625476837158, + "logits/rejected": -1.5906250476837158, + "logps/chosen": -453.20001220703125, + "logps/rejected": -469.6000061035156, + "loss": 0.399969482421875, + "memory(GiB)": 57.72, + "nll_loss": 0.39921873807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 
12.475000381469727, + "rewards/margins": 14.649999618530273, + "rewards/rejected": -2.171875, + "step": 65, + "train_speed(iter/s)": 0.082354 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.26138680103939765, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.703125, + "logps/chosen": -408.6000061035156, + "logps/rejected": -489.79998779296875, + "loss": 0.4444091796875, + "memory(GiB)": 57.72, + "nll_loss": 0.44453126192092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.287500381469727, + "rewards/margins": 13.925000190734863, + "rewards/rejected": -1.6437499523162842, + "step": 70, + "train_speed(iter/s)": 0.082355 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.44633184477159904, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.5499999523162842, + "logps/chosen": -506.3999938964844, + "logps/rejected": -582.4000244140625, + "loss": 0.4694580078125, + "memory(GiB)": 57.72, + "nll_loss": 0.46953123807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.550000190734863, + "rewards/margins": 14.512499809265137, + "rewards/rejected": -1.9695312976837158, + "step": 75, + "train_speed(iter/s)": 0.082702 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.3116745813140644, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": -1.6687500476837158, + "logits/rejected": -1.609375, + "logps/chosen": -433.6000061035156, + "logps/rejected": -563.5999755859375, + "loss": 0.46090087890625, + "memory(GiB)": 57.72, + "nll_loss": 0.4710937440395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.612500190734863, + "rewards/margins": 15.0625, + "rewards/rejected": -2.4468750953674316, + "step": 80, + "train_speed(iter/s)": 0.08211 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -1.6171875, + "eval_logits/rejected": -1.28125, + "eval_logps/chosen": -130.0, + "eval_logps/rejected": -1056.0, + "eval_loss": 0.28369140625, + 
"eval_nll_loss": 0.283203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.0, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": 1.5, + "eval_runtime": 4.4281, + "eval_samples_per_second": 0.903, + "eval_steps_per_second": 0.226, + "step": 80 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.3778186960664974, + "learning_rate": 1.6760206719303105e-05, + "logits/chosen": -1.5859375, + "logits/rejected": -1.592187523841858, + "logps/chosen": -384.3999938964844, + "logps/rejected": -660.0, + "loss": 0.4051177978515625, + "memory(GiB)": 57.72, + "nll_loss": 0.40507811307907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.675000190734863, + "rewards/margins": 13.087499618530273, + "rewards/rejected": -0.42578125, + "step": 85, + "train_speed(iter/s)": 0.082 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.18604144042181714, + "learning_rate": 1.1697777844051105e-05, + "logits/chosen": -1.5515625476837158, + "logits/rejected": -1.603124976158142, + "logps/chosen": -488.79998779296875, + "logps/rejected": -490.0, + "loss": 0.43151397705078126, + "memory(GiB)": 57.72, + "nll_loss": 0.43085938692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 15.612500190734863, + "rewards/rejected": -1.6233398914337158, + "step": 90, + "train_speed(iter/s)": 0.082606 + }, + { + "epoch": 2.5, + "grad_norm": 0.14745380356896057, + "learning_rate": 7.444166378150013e-06, + "logits/chosen": -1.640625, + "logits/rejected": -1.6218750476837158, + "logps/chosen": -521.2000122070312, + "logps/rejected": -601.2000122070312, + "loss": 0.4540283203125, + "memory(GiB)": 57.72, + "nll_loss": 0.45390623807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.625, + "rewards/rejected": -1.818750023841858, + "step": 95, + "train_speed(iter/s)": 0.082454 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.27812231190579917, + "learning_rate": 4.089194655986306e-06, + "logits/chosen": 
-1.626562476158142, + "logits/rejected": -1.548437476158142, + "logps/chosen": -400.0, + "logps/rejected": -471.20001220703125, + "loss": 0.3977935791015625, + "memory(GiB)": 66.97, + "nll_loss": 0.3980468809604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.837499618530273, + "rewards/margins": 15.225000381469727, + "rewards/rejected": -2.390625, + "step": 100, + "train_speed(iter/s)": 0.082789 + }, + { + "epoch": 2.6315789473684212, + "eval_logits/chosen": -1.609375, + "eval_logits/rejected": -1.2734375, + "eval_logps/chosen": -129.0, + "eval_logps/rejected": -1064.0, + "eval_loss": 0.281494140625, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.0625, + "eval_rewards/margins": 10.1875, + "eval_rewards/rejected": 0.90234375, + "eval_runtime": 4.383, + "eval_samples_per_second": 0.913, + "eval_steps_per_second": 0.228, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 420874174857216.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/training_args.bin b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..70d93cb167733b03530541b24294d8d696f9d63f --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52168d9a8e1df75ad4897f691add30ba782a6014e94426406b369d12e504dd11 +size 9016 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/zero_to_fp32.py b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/README.md b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/adapter_config.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a869ebfcab317c3d514f62adbd5d078fd257a39a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/adapter_model.safetensors b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7e84ea0049fbf41ec98bcf0caa77bfb03eb8d31c --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce0459fe5d78c8310d00b6520edd539e1c6cb68f35d92e6a064fe3f59fa90560 +size 134337704 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/additional_config.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/args.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/args.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe933211dec07797d29e82d5866d38722980d87 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + 
"model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": 
"steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": 
null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fefee2c55940aa04278108f33f2d0802d2a39a6b --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c43673679676f0d2270002997dacd681c5652fea3c6bfc2ff2d9c9eada9e05f6 +size 100667312 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b91b8732b511ebe11ff2ddb59d7afc322b7dfa02 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:182b9b14c6b1a9e80cca5edf8e871e6e757c333d58ddc350921526cee4c52c36 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b96936616f778caf9164d0042867282f01d3228e --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cadefbcdc07aa24abb180e47ead2f2ab83b1556223b145d28505d6398e6f9d1 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44fd9093d7d1b638c5b4914e627507fba2b49b8c --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e23d804684d67f356b96ee96fe7aad52943a37dafac992be86be7aa9e43b7ac6 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd9b1b0ee991369ffb29de4691d3b533f758e2fc --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea7964efb8d541817128c5220571f91f755d42000f26d59605368f32dd888e09 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4df02880a3b017f9ff6504217f46133d3159a329 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:827971ddf8e35934bc30f8d6cd3642ad0076926669a133e5acf87d8bd6c96b77 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2463d8044bf2caafa5849176a7bb2e95a8208b8c --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df99d64e47a00916e7509f33991ae46f9598ee9915f9446c6e088b3dbd2b8582 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df6f7b92aa7babebfa7a1dcdfe71be3c283e3940 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7c091e6053ab89b6d9a41eb9bca3058c3a2b2d9b91c4aa798ea63feb0d839a +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53dc9a6bcc4e6233069b7970c0820e8435e16813 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:54095c99836ae030118e2e030bc7eea5f651d02e644a0a76212771adf9d43fb6 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc50b2a8cc153aaccde56f64d7b0e4c72ff0ba09 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c8e755f3e36ce3faa3282733c9a7713b4e9fb6b7c76449877650009b7273b2 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfa3ae9fa4b8746734022a7e3ed29ee5ba3143da --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3a9739ed8863b740a5e9414e826780073db7ac9bc6d0a35f9c61f85e9df80f7 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc0f8ac127697dc96736d7d6d1d41156f9e4f29a --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:473f65edb4ddb09b0c01b52e7029aa922171edbf07c0e70628012b5f5bc1bd7c +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ac70ad5d11dd3e2a380903dcd5cbb7b3cad9f69 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a0af1b4843d25f0e5fae735a8996e55d4c657bcd930240e80786f2ba0f694e +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..072e23fe09240f540de4effed0c74b58be373e17 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbd5a00ccd32301ab09ca14e493b34750056e34c8c69eb1403fab1bd6efc836a +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4298ea5ce265d5347a67f5d5b5f8940a76d317be --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8c938ce4ac141fdc6d26f69e85373475e7726d2f5fb964586cc8016d96d1710 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d98961c8fe7a7d1b4c8d1b0fcad732077dd62614 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0ed8f3af49e6ad29ddd7590f0827044c41016e508ba568599cc7081af5dc423 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/latest b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/latest new file mode 100644 index 0000000000000000000000000000000000000000..aad80f76777fd4d23b0b81026f4601524335cbe1 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/latest @@ -0,0 +1 @@ +global_step114 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_0.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_0.pth new file mode 
100644 index 0000000000000000000000000000000000000000..ee83ae5e323e0bb676daf05f7f41b7951b49c7af --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9162e03c562553a5d9d13120f544d3c47ea71bb39aa44e18253675e17ed4a4 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_1.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cd0edf74beb406ae74d27fac689e74cc1a7d12b --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4809456871b3a40c8db7e0926a9db11b01149a1d483fb29b16fc69dabaf36c6f +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_2.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..378e4e23e02084387cef58f5bfa08ef5b23ef1b3 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb6bcf25ff148b74eea7dd4895fc42e9433538fff5d75f0d2ae6cb0c2fdadf0 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_3.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9d23b00a6e62ab23a83b688e4077471f0501ba0 --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f00ea04cd1a52c539d9cc948ac8a04676d6b99702acd09149565f781806f63f +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_4.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf6105fec105f5636599de6b5ea414adc300ed30 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5571fb2fc1b413792b01ac691c759786855573992bab1d14875faccdaf8c881e +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_5.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..983c7580e17a958602e3218e885e88e85d4ed9a0 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59019ba23ead9c15851cb4349397254458ce50ea3c2987090404f4f3842c6d8f +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_6.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f87fedb0a1eac5d251eeb1e7cf58190877f6b60 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:45fdffda57fda4a555da7a5de6fc6ec7324e0dae048b92519af6c4f6a1bc7412 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_7.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d32d0d7a4ca68837a8e91f7101758f2f48116bde --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fb2c13e63aba83c4505fae1639f79a33853d8f1bebe20cecb73bf53c8e7c46 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/scheduler.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a86ac614a477eb67963adb2c8c07f37c79ded059 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7a9fd18bda7faa50931342147a7de5605bed0f91f6c70d821e84b7bf8f444f +size 1064 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/trainer_state.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2e954df8f921d2c75671cd5ec651738466dd6752 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/trainer_state.json @@ -0,0 +1,549 @@ +{ + "best_metric": 0.28076172, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114", + "epoch": 3.0, + "eval_steps": 20, + "global_step": 114, + 
"is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 4.865237153661176, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.6015625, + "logps/chosen": -664.0, + "logps/rejected": -243.0, + "loss": 1.279296875, + "memory(GiB)": 14.31, + "nll_loss": 0.58984375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.060374 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 8.264469185177074, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.572265625, + "logits/rejected": -1.572265625, + "logps/chosen": -676.25, + "logps/rejected": -368.0, + "loss": 1.886962890625, + "memory(GiB)": 24.25, + "nll_loss": 1.1826171875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0015869140625, + "rewards/margins": -0.00927734375, + "rewards/rejected": 0.01096343994140625, + "step": 5, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 4.869686740353647, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.506250023841858, + "logits/rejected": -1.564062476158142, + "logps/chosen": -566.7999877929688, + "logps/rejected": -591.7999877929688, + "loss": 1.74365234375, + "memory(GiB)": 56.27, + "nll_loss": 1.2609374523162842, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.384374976158142, + "rewards/margins": 0.6645263433456421, + "rewards/rejected": 0.719042956829071, + "step": 10, + "train_speed(iter/s)": 0.078572 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.219941119836901, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.592187523841858, + "logits/rejected": -1.670312523841858, + "logps/chosen": -587.2000122070312, + "logps/rejected": -406.6000061035156, + "loss": 0.95322265625, + "memory(GiB)": 56.27, + "nll_loss": 
0.7054687738418579, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 4.762499809265137, + "rewards/margins": 3.004687547683716, + "rewards/rejected": 1.764062523841858, + "step": 15, + "train_speed(iter/s)": 0.08295 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 2.2412220923276367, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6375000476837158, + "logits/rejected": -1.610937476158142, + "logps/chosen": -369.0, + "logps/rejected": -490.0, + "loss": 0.7086181640625, + "memory(GiB)": 56.27, + "nll_loss": 0.616406261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.375, + "rewards/margins": 3.456249952316284, + "rewards/rejected": 3.924999952316284, + "step": 20, + "train_speed(iter/s)": 0.083066 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.3046875, + "eval_logps/chosen": -147.0, + "eval_logps/rejected": -1020.0, + "eval_loss": 0.38232421875, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.3125, + "eval_rewards/margins": 4.15625, + "eval_rewards/rejected": 5.15625, + "eval_runtime": 4.3657, + "eval_samples_per_second": 0.916, + "eval_steps_per_second": 0.229, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 1.097155209357345, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -1.626562476158142, + "logits/rejected": -1.576562523841858, + "logps/chosen": -538.7999877929688, + "logps/rejected": -572.2000122070312, + "loss": 0.53927001953125, + "memory(GiB)": 56.27, + "nll_loss": 0.513671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.412500381469727, + "rewards/margins": 6.025000095367432, + "rewards/rejected": 2.371875047683716, + "step": 25, + "train_speed(iter/s)": 0.08148 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8998878137633866, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.681249976158142, + "logps/chosen": 
-396.3999938964844, + "logps/rejected": -498.3999938964844, + "loss": 0.46839599609375, + "memory(GiB)": 56.27, + "nll_loss": 0.46757811307907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.949999809265137, + "rewards/margins": 8.787500381469727, + "rewards/rejected": 0.15253905951976776, + "step": 30, + "train_speed(iter/s)": 0.081373 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.4025215648853001, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -1.665624976158142, + "logits/rejected": -1.6828124523162842, + "logps/chosen": -610.2000122070312, + "logps/rejected": -542.7999877929688, + "loss": 0.5325042724609375, + "memory(GiB)": 56.27, + "nll_loss": 0.532031238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.512499809265137, + "rewards/margins": 10.462499618530273, + "rewards/rejected": -0.964062511920929, + "step": 35, + "train_speed(iter/s)": 0.081424 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.49646039527754704, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.703125, + "logps/chosen": -476.79998779296875, + "logps/rejected": -578.0, + "loss": 0.5092529296875, + "memory(GiB)": 56.27, + "nll_loss": 0.5648437738418579, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.912500381469727, + "rewards/margins": 11.987500190734863, + "rewards/rejected": -2.090625047683716, + "step": 40, + "train_speed(iter/s)": 0.081711 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.296875, + "eval_logps/chosen": -136.0, + "eval_logps/rejected": -1080.0, + "eval_loss": 0.294677734375, + "eval_nll_loss": 0.294921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.5, + "eval_rewards/margins": 11.1875, + "eval_rewards/rejected": -0.69921875, + "eval_runtime": 4.3578, + "eval_samples_per_second": 0.918, + "eval_steps_per_second": 0.229, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 
0.4969045495850996, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -1.6328125, + "logits/rejected": -1.6015625, + "logps/chosen": -444.6000061035156, + "logps/rejected": -614.0, + "loss": 0.4313232421875, + "memory(GiB)": 57.72, + "nll_loss": 0.43085938692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.699999809265137, + "rewards/margins": 13.237500190734863, + "rewards/rejected": -2.549999952316284, + "step": 45, + "train_speed(iter/s)": 0.080571 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.2582495580909455, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7218749523162842, + "logps/chosen": -496.0, + "logps/rejected": -584.4000244140625, + "loss": 0.4834228515625, + "memory(GiB)": 57.72, + "nll_loss": 0.4828124940395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 13.399999618530273, + "rewards/rejected": -2.135937452316284, + "step": 50, + "train_speed(iter/s)": 0.080901 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.1378301574581502, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -1.584375023841858, + "logits/rejected": -1.6453125476837158, + "logps/chosen": -475.6000061035156, + "logps/rejected": -534.4000244140625, + "loss": 0.46617431640625, + "memory(GiB)": 57.72, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.800000190734863, + "rewards/margins": 14.0, + "rewards/rejected": -2.214062452316284, + "step": 55, + "train_speed(iter/s)": 0.08215 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.12374674083229541, + "learning_rate": 5e-05, + "logits/chosen": -1.5671875476837158, + "logits/rejected": -1.6515624523162842, + "logps/chosen": -515.2000122070312, + "logps/rejected": -377.6000061035156, + "loss": 0.42254638671875, + "memory(GiB)": 57.72, + "nll_loss": 0.42304688692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.524999618530273, + "rewards/margins": 14.125, + 
"rewards/rejected": -1.598046898841858, + "step": 60, + "train_speed(iter/s)": 0.082614 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -1.6328125, + "eval_logits/rejected": -1.2890625, + "eval_logps/chosen": -131.0, + "eval_logps/rejected": -1072.0, + "eval_loss": 0.285400390625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.875, + "eval_rewards/margins": 10.6875, + "eval_rewards/rejected": 0.2001953125, + "eval_runtime": 4.4113, + "eval_samples_per_second": 0.907, + "eval_steps_per_second": 0.227, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.25020620326140985, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -1.6765625476837158, + "logits/rejected": -1.5906250476837158, + "logps/chosen": -453.20001220703125, + "logps/rejected": -469.6000061035156, + "loss": 0.399969482421875, + "memory(GiB)": 57.72, + "nll_loss": 0.39921873807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.475000381469727, + "rewards/margins": 14.649999618530273, + "rewards/rejected": -2.171875, + "step": 65, + "train_speed(iter/s)": 0.082354 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.26138680103939765, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.703125, + "logps/chosen": -408.6000061035156, + "logps/rejected": -489.79998779296875, + "loss": 0.4444091796875, + "memory(GiB)": 57.72, + "nll_loss": 0.44453126192092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.287500381469727, + "rewards/margins": 13.925000190734863, + "rewards/rejected": -1.6437499523162842, + "step": 70, + "train_speed(iter/s)": 0.082355 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.44633184477159904, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.5499999523162842, + "logps/chosen": -506.3999938964844, + "logps/rejected": -582.4000244140625, + "loss": 0.4694580078125, + "memory(GiB)": 
57.72, + "nll_loss": 0.46953123807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.550000190734863, + "rewards/margins": 14.512499809265137, + "rewards/rejected": -1.9695312976837158, + "step": 75, + "train_speed(iter/s)": 0.082702 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.3116745813140644, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": -1.6687500476837158, + "logits/rejected": -1.609375, + "logps/chosen": -433.6000061035156, + "logps/rejected": -563.5999755859375, + "loss": 0.46090087890625, + "memory(GiB)": 57.72, + "nll_loss": 0.4710937440395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.612500190734863, + "rewards/margins": 15.0625, + "rewards/rejected": -2.4468750953674316, + "step": 80, + "train_speed(iter/s)": 0.08211 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -1.6171875, + "eval_logits/rejected": -1.28125, + "eval_logps/chosen": -130.0, + "eval_logps/rejected": -1056.0, + "eval_loss": 0.28369140625, + "eval_nll_loss": 0.283203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.0, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": 1.5, + "eval_runtime": 4.4281, + "eval_samples_per_second": 0.903, + "eval_steps_per_second": 0.226, + "step": 80 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.3778186960664974, + "learning_rate": 1.6760206719303105e-05, + "logits/chosen": -1.5859375, + "logits/rejected": -1.592187523841858, + "logps/chosen": -384.3999938964844, + "logps/rejected": -660.0, + "loss": 0.4051177978515625, + "memory(GiB)": 57.72, + "nll_loss": 0.40507811307907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.675000190734863, + "rewards/margins": 13.087499618530273, + "rewards/rejected": -0.42578125, + "step": 85, + "train_speed(iter/s)": 0.082 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.18604144042181714, + "learning_rate": 1.1697777844051105e-05, + "logits/chosen": -1.5515625476837158, + "logits/rejected": -1.603124976158142, + 
"logps/chosen": -488.79998779296875, + "logps/rejected": -490.0, + "loss": 0.43151397705078126, + "memory(GiB)": 57.72, + "nll_loss": 0.43085938692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.0, + "rewards/margins": 15.612500190734863, + "rewards/rejected": -1.6233398914337158, + "step": 90, + "train_speed(iter/s)": 0.082606 + }, + { + "epoch": 2.5, + "grad_norm": 0.14745380356896057, + "learning_rate": 7.444166378150013e-06, + "logits/chosen": -1.640625, + "logits/rejected": -1.6218750476837158, + "logps/chosen": -521.2000122070312, + "logps/rejected": -601.2000122070312, + "loss": 0.4540283203125, + "memory(GiB)": 57.72, + "nll_loss": 0.45390623807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.8125, + "rewards/margins": 15.625, + "rewards/rejected": -1.818750023841858, + "step": 95, + "train_speed(iter/s)": 0.082454 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.27812231190579917, + "learning_rate": 4.089194655986306e-06, + "logits/chosen": -1.626562476158142, + "logits/rejected": -1.548437476158142, + "logps/chosen": -400.0, + "logps/rejected": -471.20001220703125, + "loss": 0.3977935791015625, + "memory(GiB)": 66.97, + "nll_loss": 0.3980468809604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.837499618530273, + "rewards/margins": 15.225000381469727, + "rewards/rejected": -2.390625, + "step": 100, + "train_speed(iter/s)": 0.082789 + }, + { + "epoch": 2.6315789473684212, + "eval_logits/chosen": -1.609375, + "eval_logits/rejected": -1.2734375, + "eval_logps/chosen": -129.0, + "eval_logps/rejected": -1064.0, + "eval_loss": 0.281494140625, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.0625, + "eval_rewards/margins": 10.1875, + "eval_rewards/rejected": 0.90234375, + "eval_runtime": 4.383, + "eval_samples_per_second": 0.913, + "eval_steps_per_second": 0.228, + "step": 100 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.47078492381027764, + "learning_rate": 
1.70370868554659e-06, + "logits/chosen": -1.6593749523162842, + "logits/rejected": -1.6484375, + "logps/chosen": -436.20001220703125, + "logps/rejected": -444.20001220703125, + "loss": 0.4334808349609375, + "memory(GiB)": 66.97, + "nll_loss": 0.4332031309604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.975000381469727, + "rewards/margins": 14.737500190734863, + "rewards/rejected": -1.778906226158142, + "step": 105, + "train_speed(iter/s)": 0.082503 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.27296904355561863, + "learning_rate": 3.380821129028489e-07, + "logits/chosen": -1.6171875, + "logits/rejected": -1.634374976158142, + "logps/chosen": -451.6000061035156, + "logps/rejected": -476.0, + "loss": 0.385760498046875, + "memory(GiB)": 66.97, + "nll_loss": 0.38554686307907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 13.912500381469727, + "rewards/margins": 15.737500190734863, + "rewards/rejected": -1.814062476158142, + "step": 110, + "train_speed(iter/s)": 0.082596 + }, + { + "epoch": 3.0, + "eval_logits/chosen": -1.609375, + "eval_logits/rejected": -1.2734375, + "eval_logps/chosen": -129.0, + "eval_logps/rejected": -1064.0, + "eval_loss": 0.28076171875, + "eval_nll_loss": 0.28125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.0625, + "eval_rewards/margins": 10.4375, + "eval_rewards/rejected": 0.65234375, + "eval_runtime": 4.3424, + "eval_samples_per_second": 0.921, + "eval_steps_per_second": 0.23, + "step": 114 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 478691537715200.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/training_args.bin b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..70d93cb167733b03530541b24294d8d696f9d63f --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52168d9a8e1df75ad4897f691add30ba782a6014e94426406b369d12e504dd11 +size 9016 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/zero_to_fp32.py b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient. + Convert a pseudo tensor to a torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/README.md b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/adapter_config.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a869ebfcab317c3d514f62adbd5d078fd257a39a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": 
{}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/adapter_model.safetensors b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4d8962aa68f2560eb2d58bce122f86d2e090a10d --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d742f3f7631106af047816db427d53b44eb15072775949bc3eaebb7355a134d5 +size 134337704 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/additional_config.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/args.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe933211dec07797d29e82d5866d38722980d87 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": 
"deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + 
"prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + 
"fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69ed3ecd382fb6c323e18f1ce1d30a81914cd548 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43106a2677cb9b61cb54869e92d43a49cecb2c73502ce7cd2dd24b73a29272a4 +size 100667312 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55c6b3cb4da1f9d6fed60f31474a4ea1d1f27d3a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54e3d5eabe3670c104519052c8cf12f5bc873133aaf68ce67d9f19e5d3172ea3 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49595ae5d3ae8d17f5b3d68e66d8e32774393d71 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b98e5552ec1c691d2a8c51211f6ad005a910d9dc685fe69c1396cf416694561b +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00be4832410eeca3380f0078ae47197ab5d0f56d --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e84227eca8750f313d99927066f98ba6e967c11a29d9697f0cb570e2db7fefa2 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..337cdc4e1f0410c9974bc62e3f3527c3990aa073 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92227888a976a6ecb8da17aac1e52377c10ab98f7b019d69f55d2390f7dd8888 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..622927bb78843368a2d54fd621264c2792a73865 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f60289991348a625693ff0060d468879147e691af40763d3b539666e61c899f +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f1ac5bc162dfe8ad403025a58a7ac4a9d177127 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:218e8d852806fd41d061d867ef1e97d56076025347ea13c0b6da755f8291098c +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c155ce66426a14855f75a59e72cd798639986d8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d28fe380654de85bd23edbce17f126fb919cb5d70f8a15351d7fd987b54e511 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..568bd45d5b24b4795d70264d7269368bd244666a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1bcd524627b2dad5b16b0a8edb70803106664642a8284c27bbe2dbecbdbed081 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c873e3cc664878fd567a173af654bb6edc01082c --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c6ad868c9b25cdcce1c572d1af1478c81bb85b133233d84e8fa210ae2cacbcb +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2310c72f6b7216381b678c6f8059c5d6bf503a1f --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09658b5a6f4b67d5f95d493b56b7fea6c46219a443fd636e18b9b4f1861bffa7 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bd1e6e63a6311e5cad0275b9d98362f2aa1648e --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b749026718e43177a907e5c6de2f7f5a629128e982f84b67d4143a3a3070387a +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c724345add8910891a1ee1da02a5c9218b8ec91 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2f67d96c86e28092b802f396768141688b61089904cb1315359c2432fb4b76 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b086667f2a457409146668d50c1a293b6eda5f5 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c578445d87812217a3dcaabedcd37902ba6905812ae38dde3ee9642e5be1939 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53ec2782ab6ef279988ab2245a2b1f3d17e17fac --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daa35a6dd309918b376938035b2611b50efee2bbdafd603637075b346da38c86 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..936e00196f3e863de78936cafae49fba3bf50657 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76ad7aa6e9d6acdcab0184b142efaf843b638347e89b34d1ef8a0979ecb9332 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/latest b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_0.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..b346349ce12dd5a17d4b91ed2a5722bb52550950 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_1.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f3c6994456cb8d0592a5375d99503c8924b1c4 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_2.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..be044f6ceeed587d30e80c2f72d5aa19fdc9947b --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_3.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc825249656a9b858782542bd3f4386250f1dfe0 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_4.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d30f52a44be563c152ae09db6ae934da6da0d3ed --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_5.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8715d27ab23ae545d58039cf949cc44ecc1da5e --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_6.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed791b6ef76eadf0b0c55a5733411771e2ae027 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa +size 15984 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_7.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..800c3bbbc5edf7db01a8316069d439c5fb8d8c30 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/scheduler.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e201402bb36891e48e2b7110304ad87df61a6070 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b40f5e8ba2f299f4eda41d6964ef1f313f53d1f8f687ebd6938ce3242fb4c3 +size 1064 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/trainer_state.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6c51f3ec4722e2a1e8e8feab35a41fbebb10ec81 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.38232422, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20", + "epoch": 0.5263157894736842, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": 
true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 4.865237153661176, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.6015625, + "logps/chosen": -664.0, + "logps/rejected": -243.0, + "loss": 1.279296875, + "memory(GiB)": 14.31, + "nll_loss": 0.58984375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.060374 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 8.264469185177074, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.572265625, + "logits/rejected": -1.572265625, + "logps/chosen": -676.25, + "logps/rejected": -368.0, + "loss": 1.886962890625, + "memory(GiB)": 24.25, + "nll_loss": 1.1826171875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0015869140625, + "rewards/margins": -0.00927734375, + "rewards/rejected": 0.01096343994140625, + "step": 5, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 4.869686740353647, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.506250023841858, + "logits/rejected": -1.564062476158142, + "logps/chosen": -566.7999877929688, + "logps/rejected": -591.7999877929688, + "loss": 1.74365234375, + "memory(GiB)": 56.27, + "nll_loss": 1.2609374523162842, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.384374976158142, + "rewards/margins": 0.6645263433456421, + "rewards/rejected": 0.719042956829071, + "step": 10, + "train_speed(iter/s)": 0.078572 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.219941119836901, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.592187523841858, + "logits/rejected": -1.670312523841858, + "logps/chosen": -587.2000122070312, + "logps/rejected": -406.6000061035156, + "loss": 0.95322265625, + "memory(GiB)": 56.27, + "nll_loss": 0.7054687738418579, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 
4.762499809265137, + "rewards/margins": 3.004687547683716, + "rewards/rejected": 1.764062523841858, + "step": 15, + "train_speed(iter/s)": 0.08295 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 2.2412220923276367, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6375000476837158, + "logits/rejected": -1.610937476158142, + "logps/chosen": -369.0, + "logps/rejected": -490.0, + "loss": 0.7086181640625, + "memory(GiB)": 56.27, + "nll_loss": 0.616406261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.375, + "rewards/margins": 3.456249952316284, + "rewards/rejected": 3.924999952316284, + "step": 20, + "train_speed(iter/s)": 0.083066 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.3046875, + "eval_logps/chosen": -147.0, + "eval_logps/rejected": -1020.0, + "eval_loss": 0.38232421875, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.3125, + "eval_rewards/margins": 4.15625, + "eval_rewards/rejected": 5.15625, + "eval_runtime": 4.3657, + "eval_samples_per_second": 0.916, + "eval_steps_per_second": 0.229, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 80894479040512.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/training_args.bin b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..70d93cb167733b03530541b24294d8d696f9d63f --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52168d9a8e1df75ad4897f691add30ba782a6014e94426406b369d12e504dd11 +size 9016 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/zero_to_fp32.py b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/README.md b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/adapter_config.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a869ebfcab317c3d514f62adbd5d078fd257a39a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": 
{}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/adapter_model.safetensors b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8f2374479ac02bb2f89c0b353a9401fce9d1308 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97881b64ecfbc2dbff451dd28551a9ead654b35d589eb20d64bcce5b813f98fe +size 134337704 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/additional_config.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/args.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe933211dec07797d29e82d5866d38722980d87 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": 
"deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + 
"prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + 
"fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fafbc8fb0a9342603a7b31644a40e2db69ab335 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c78b887a1818bde44ee6e7b44005bc82a41852b7636c9bbaee89b57472aaac3 +size 100667312 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3904ce48c32a78e174fd6243c92df32c98f069c --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fe38cf53793c55136acd3b41c55b59fd8ce90f640cd4bb1b45bb8de1bb85241 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb29b128024914ab6d4127a48c65c450b3797058 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fac2637771012219a6056299758c45819cbfbc12ea9b32beab2b4739d1feaeb +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ffa74f5ffde28b4d35a80dc79e30bc9f36e589a --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe5287999a2bf1959e7d79fb417d9f6cdf492440615546a22fb51f2e3b5b17a +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eaf80fcce824d8d891d427560c7ef5d883174d2d --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dbe879fb312ec7647b9b646002013be4304e72dbb4ee4c7f421e5f69b0c0c80 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5f0f50fb070da7670adef5bd92444d5115ea9ad --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db0b889b90c8e7d947a3f5d451c0d657c4f320ef76546bd962eb5ae5ff785968 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ac87992857efa7b7818d41c1e7db3712a8ccddf --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fd1b1545f90c02651198cabe77bf3ed49e916a7acdb52d503f439f806523ad7 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25750f13a753119a3fa5f0afb9ccfbcf29985aeb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f15d470f63e82bab9058416e11efbe283f08adf196b62e8b1c273c58adb54bfa +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f10785a8506c0ed3a076bf40580b92f8f0d8681f --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:95c377a651d8312a3cf664ae4964a4d13c7cc983079ff75fb7711576ce0ee804 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..152dbedd594889053f98d4c6ea32455bdfe143ab --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939dd0b0ec6312772b917a3877173aa41e4083bddc429c50679c8fc46fb839c0 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd71b37ec4653685068305918d1c69021fa13fe8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b92023b2f1a5933044cee843dca5906d60e1f7779b5100c3af760c5de258b5f9 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59d690bc138d4b4b79439a732622c712387b0912 --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4d3822b0194c0eae029240a23bcf354f11a9882e2b48c781f6eea432bc5615d +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..046292a63c4b317b4e70290b9590cb0771487b18 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a0c2b5d6915f15b6d5d29be0db6bbabcaa64fcdc1344f7e1110afd7b3df99b7 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58a2c01e32285f08a569aab7bdb7a31399d300e6 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6995110a567575e10c0990db248419a618cf1ec5185331d4ae38f95a15aa9260 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1239e290557fb6040f714627320c9ced1c341554 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98363c93b05b8d5ed2287dcd83721e37a7d46da9e925d569edbc0b9816b57df9 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45dfbcc355f6dd1054aec8495a2bc38ef585f181 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f36a8407ddf51d1ac0711f9287737668b9b311662e4e8a9a14e55eba0862a17 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/latest b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_0.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..4e5b7e2ec90fdb824c8932464c1d9068330655a7 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36d2a2034ebb05cb71c510897f2795b31164e50f17b270bc25d2be3ad9a17b22 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_1.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d8d7722fc72cab6d492b76cb99c8177dcc47544 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060dfdb1c49102cbdc8868a6031e68787601b4ccd782f3fb9b137e20c1fd2c7a +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_2.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c9f84eff30cfa9ea1feedaf262d61fb12e4cba7 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af01895cb66e616591f2e4baa8dcd8151530eab133c73571ccb31c74f35422ce +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_3.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6eebfb928f8e91eff0ea1645a20b5aa4465c705b --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677921992b1e0cef3aee776f245975003d22f51d9bd6ed20f248ded1deb72fa9 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_4.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..0866030a266c6d003cc378a9418a723f69e8ab99 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d69353c629541c690c5471f8ec05fdab2bfecf3d37afaa436bc45939da6db68f +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_5.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..554638d77107f832d7aa51c61645ee2d6c48a36d --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e40ba6668cc03c9162c68a933d164bf38ae2d196a9a6fec03ae615491201185 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_6.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..964331b65172a1bcac03e4673415fa787f724268 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:870968fea834e24b2e099cf3e4fe1e3fb8caf38d8f8e5b790d7d47386d4d05f5 +size 15984 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_7.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd4754d65217d0f9d1f2d3334397df7a8a079652 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9e19618bee7c6ef43256fea25abe19bca88535eb1e7dc213cde8929ae4e8180 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/scheduler.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d2abd2d1feb7e9804d318f0409ab46d47248ca5 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc0cfcde03016592eed8191f897341f523bbb99d728821c8afed66eae5a64729 +size 1064 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/trainer_state.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cef68d5a3420d07144f64a5ba7253b521b548b2a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.29467773, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40", + "epoch": 1.0526315789473684, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": 
true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 4.865237153661176, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.6015625, + "logps/chosen": -664.0, + "logps/rejected": -243.0, + "loss": 1.279296875, + "memory(GiB)": 14.31, + "nll_loss": 0.58984375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.060374 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 8.264469185177074, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.572265625, + "logits/rejected": -1.572265625, + "logps/chosen": -676.25, + "logps/rejected": -368.0, + "loss": 1.886962890625, + "memory(GiB)": 24.25, + "nll_loss": 1.1826171875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0015869140625, + "rewards/margins": -0.00927734375, + "rewards/rejected": 0.01096343994140625, + "step": 5, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 4.869686740353647, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.506250023841858, + "logits/rejected": -1.564062476158142, + "logps/chosen": -566.7999877929688, + "logps/rejected": -591.7999877929688, + "loss": 1.74365234375, + "memory(GiB)": 56.27, + "nll_loss": 1.2609374523162842, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.384374976158142, + "rewards/margins": 0.6645263433456421, + "rewards/rejected": 0.719042956829071, + "step": 10, + "train_speed(iter/s)": 0.078572 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.219941119836901, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.592187523841858, + "logits/rejected": -1.670312523841858, + "logps/chosen": -587.2000122070312, + "logps/rejected": -406.6000061035156, + "loss": 0.95322265625, + "memory(GiB)": 56.27, + "nll_loss": 0.7054687738418579, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 
4.762499809265137, + "rewards/margins": 3.004687547683716, + "rewards/rejected": 1.764062523841858, + "step": 15, + "train_speed(iter/s)": 0.08295 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 2.2412220923276367, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6375000476837158, + "logits/rejected": -1.610937476158142, + "logps/chosen": -369.0, + "logps/rejected": -490.0, + "loss": 0.7086181640625, + "memory(GiB)": 56.27, + "nll_loss": 0.616406261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.375, + "rewards/margins": 3.456249952316284, + "rewards/rejected": 3.924999952316284, + "step": 20, + "train_speed(iter/s)": 0.083066 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.3046875, + "eval_logps/chosen": -147.0, + "eval_logps/rejected": -1020.0, + "eval_loss": 0.38232421875, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.3125, + "eval_rewards/margins": 4.15625, + "eval_rewards/rejected": 5.15625, + "eval_runtime": 4.3657, + "eval_samples_per_second": 0.916, + "eval_steps_per_second": 0.229, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 1.097155209357345, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -1.626562476158142, + "logits/rejected": -1.576562523841858, + "logps/chosen": -538.7999877929688, + "logps/rejected": -572.2000122070312, + "loss": 0.53927001953125, + "memory(GiB)": 56.27, + "nll_loss": 0.513671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.412500381469727, + "rewards/margins": 6.025000095367432, + "rewards/rejected": 2.371875047683716, + "step": 25, + "train_speed(iter/s)": 0.08148 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8998878137633866, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.681249976158142, + "logps/chosen": -396.3999938964844, + "logps/rejected": -498.3999938964844, + "loss": 0.46839599609375, + 
"memory(GiB)": 56.27, + "nll_loss": 0.46757811307907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.949999809265137, + "rewards/margins": 8.787500381469727, + "rewards/rejected": 0.15253905951976776, + "step": 30, + "train_speed(iter/s)": 0.081373 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.4025215648853001, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -1.665624976158142, + "logits/rejected": -1.6828124523162842, + "logps/chosen": -610.2000122070312, + "logps/rejected": -542.7999877929688, + "loss": 0.5325042724609375, + "memory(GiB)": 56.27, + "nll_loss": 0.532031238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.512499809265137, + "rewards/margins": 10.462499618530273, + "rewards/rejected": -0.964062511920929, + "step": 35, + "train_speed(iter/s)": 0.081424 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.49646039527754704, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.703125, + "logps/chosen": -476.79998779296875, + "logps/rejected": -578.0, + "loss": 0.5092529296875, + "memory(GiB)": 56.27, + "nll_loss": 0.5648437738418579, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.912500381469727, + "rewards/margins": 11.987500190734863, + "rewards/rejected": -2.090625047683716, + "step": 40, + "train_speed(iter/s)": 0.081711 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.296875, + "eval_logps/chosen": -136.0, + "eval_logps/rejected": -1080.0, + "eval_loss": 0.294677734375, + "eval_nll_loss": 0.294921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.5, + "eval_rewards/margins": 11.1875, + "eval_rewards/rejected": -0.69921875, + "eval_runtime": 4.3578, + "eval_samples_per_second": 0.918, + "eval_steps_per_second": 0.229, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + 
"TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 168465840209920.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/training_args.bin b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..70d93cb167733b03530541b24294d8d696f9d63f --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52168d9a8e1df75ad4897f691add30ba782a6014e94426406b369d12e504dd11 +size 9016 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/zero_to_fp32.py b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient. + Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model``: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/README.md b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/adapter_config.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a869ebfcab317c3d514f62adbd5d078fd257a39a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": 
{}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/adapter_model.safetensors b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dae549e82edd97c3ef4e1026062a4c2067e0bb56 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b15828270b5a1f6c280307fa03fdca3313c136b2119e69a316cd33f5ac17634 +size 134337704 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/additional_config.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/args.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe933211dec07797d29e82d5866d38722980d87 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": 
"deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + 
"prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + 
"fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00bbc3e19cd2f718ad7b8ae018aae5aa582cdbc4 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9e1b1fdfd0fea25e263ed843c95b9651b29c5f613a8210e17cf632fca152098 +size 100667312 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad634e2084d5cfd83a4a141e60c4bc59bfecc80e --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8529d1c78398af0acd4cfc7b952729984d422a1a04a500a1d57c01c1c9dd09f +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..397d4c4914f7f024d2a21a8d44e663070df7a8bf --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e35d0827964b71e91234fd1ab5ce91a70f4cd650605e4190442320d72a538a0f +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5994b4004b7fbaaad54fda3d99eccafca3a8992 --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a16d76ed17106b0f8a321f3729eba5b7ad544ad740068808472df4e8cc302afa +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14953d583f4478af69d20fa19030ee1d77660bda --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3164ba8a5c24eb94347acb39c85e8a28e34e24095eb11fa98e6030055e79559e +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9541197b239eadbf8efccee132c2d1546503d676 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4992d3004554f31232a1efdd351973d1413c23c334e90ef55dec79e3da6557e +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c550319a77f5b155890c45bfc7e68ef0a3ab410a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8727e8a728275870397a5733f7cda55a3067b4e0ebf399b687eff4d1acaffd40 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61a753dd7b581fe6dc9933fdcb821efa126e4332 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d61c58f29263a2855a18f0ea4f71aaad3c13d7f3277589c5d19ed47ac444f2a1 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95c6e218905abeab034e8449dd8f06fa1d0f4cbb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3c510f789fe7f2e5e44e0497c6d898171079c9e499e97e648d757386e596fcb7 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd2d323955029e3854755520932fa2e58239e9fc --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ee410eacba2a9e503012c9d54243368cb361631d5e004d8ac15416908c101e +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68dfee0af4f9556fae948009c441882d9a5aab33 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f99fa36acd108040f065a292d841e37c11a7ce4726b19a78984f244fad18ea3 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b8ba85181a89e1e2e2ba2c85c8222cc3ccc6614 --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0827bac9c843af334e85b3e16042e9894b0b916f0246d70ed188ddf1c661715e +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..362bfb94833644c0b100db3056a30f8f37bf1de2 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aa93e288c611cd5428766325e3cc17f7ed1692250d7f4b76177bd1dcf27e02b +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..973b52304031fdd4e366d311c4b99c11fa3c78dd --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73fd8b9299506f3f0206d7f9ef249de581308703e541ef98374e5fd485a0f55c +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a142c8cbafd8df80f115c31badbf471e9f900897 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4492390157bb967268f497670367dc5566ec0777653236e4dbcd37988a7e1755 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..baa7dda86960db5ccb053ebbd916222084554f60 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:286322e1a9220435ae78f65bf292242f5ba3473b28c396bbaab7fd06af28098a +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/latest b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..6dac34b840ecfb636ba8ab1e4da79fa1bdc8c3d4 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/latest @@ -0,0 +1 @@ +global_step60 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_0.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..9d31438b0bfd38acb69501aeb325fee7751b84e8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0ef6f96a48e59aa52c4b471312c2a62378c19acc7ebbae839612b03a7d775a +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_1.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6759906b0863c54055155658e8d374770ecfc5f9 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab11d533c0fdad46ea8b8e295ba5fdb705e078eeb88cc28f37d82913508766e9 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_2.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..784c719c042a2cca1f38818c7e9638aab398c859 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615c168147e3465ce5bfab6da2ff4afc68566ce00ec0f0c6c9fc988038a58d0a +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_3.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32e705bcb6afbb2ab95f5c68c07d0ccc3d457df --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f71e8f8674ecaef9f8cdcbf7ac457a8b8ff15b12694ba2a2fffcb4b43f0f08 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_4.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b97b2d3011e43a6dbac487263b52a0b3a55c83 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cf6d674dab5545c300a55135f08ca935730a3d35e2c419fb0b333f19482c19 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_5.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1acb3d3b1d3de061b997d1dee57e44b465d0630e --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2754f2cd8824702f027870d93748b3c0491b0ecd30f1e3d8e937116b2be6151f +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_6.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7760bbbcd6d3754ac81a5218adb6e0cd8036905b --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1385124ac55604598f45ea6e2d141f29456647d3e7c10d12ca64ec93d312be8d +size 15984 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_7.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c4407057d0cb21c08140413cb320528190a941 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416538efaec7391fa8fe782fb15146b83e5612d9e1961292c34c53e964806873 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/scheduler.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d24bb2a6ed10249209e94b434ed554cac856d563 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c3a6465b9cb557a3a4db2097cdb877b1c624f5f645895d0cd27357a78258aa4 +size 1064 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/trainer_state.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8f29b49df68bcc51b6fe8f99204a257974b5769a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.28540039, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60", + "epoch": 1.5789473684210527, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": 
true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 4.865237153661176, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.6015625, + "logps/chosen": -664.0, + "logps/rejected": -243.0, + "loss": 1.279296875, + "memory(GiB)": 14.31, + "nll_loss": 0.58984375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.060374 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 8.264469185177074, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.572265625, + "logits/rejected": -1.572265625, + "logps/chosen": -676.25, + "logps/rejected": -368.0, + "loss": 1.886962890625, + "memory(GiB)": 24.25, + "nll_loss": 1.1826171875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0015869140625, + "rewards/margins": -0.00927734375, + "rewards/rejected": 0.01096343994140625, + "step": 5, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 4.869686740353647, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.506250023841858, + "logits/rejected": -1.564062476158142, + "logps/chosen": -566.7999877929688, + "logps/rejected": -591.7999877929688, + "loss": 1.74365234375, + "memory(GiB)": 56.27, + "nll_loss": 1.2609374523162842, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.384374976158142, + "rewards/margins": 0.6645263433456421, + "rewards/rejected": 0.719042956829071, + "step": 10, + "train_speed(iter/s)": 0.078572 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.219941119836901, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.592187523841858, + "logits/rejected": -1.670312523841858, + "logps/chosen": -587.2000122070312, + "logps/rejected": -406.6000061035156, + "loss": 0.95322265625, + "memory(GiB)": 56.27, + "nll_loss": 0.7054687738418579, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 
4.762499809265137, + "rewards/margins": 3.004687547683716, + "rewards/rejected": 1.764062523841858, + "step": 15, + "train_speed(iter/s)": 0.08295 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 2.2412220923276367, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6375000476837158, + "logits/rejected": -1.610937476158142, + "logps/chosen": -369.0, + "logps/rejected": -490.0, + "loss": 0.7086181640625, + "memory(GiB)": 56.27, + "nll_loss": 0.616406261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.375, + "rewards/margins": 3.456249952316284, + "rewards/rejected": 3.924999952316284, + "step": 20, + "train_speed(iter/s)": 0.083066 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.3046875, + "eval_logps/chosen": -147.0, + "eval_logps/rejected": -1020.0, + "eval_loss": 0.38232421875, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.3125, + "eval_rewards/margins": 4.15625, + "eval_rewards/rejected": 5.15625, + "eval_runtime": 4.3657, + "eval_samples_per_second": 0.916, + "eval_steps_per_second": 0.229, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 1.097155209357345, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -1.626562476158142, + "logits/rejected": -1.576562523841858, + "logps/chosen": -538.7999877929688, + "logps/rejected": -572.2000122070312, + "loss": 0.53927001953125, + "memory(GiB)": 56.27, + "nll_loss": 0.513671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.412500381469727, + "rewards/margins": 6.025000095367432, + "rewards/rejected": 2.371875047683716, + "step": 25, + "train_speed(iter/s)": 0.08148 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8998878137633866, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.681249976158142, + "logps/chosen": -396.3999938964844, + "logps/rejected": -498.3999938964844, + "loss": 0.46839599609375, + 
"memory(GiB)": 56.27, + "nll_loss": 0.46757811307907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.949999809265137, + "rewards/margins": 8.787500381469727, + "rewards/rejected": 0.15253905951976776, + "step": 30, + "train_speed(iter/s)": 0.081373 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.4025215648853001, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -1.665624976158142, + "logits/rejected": -1.6828124523162842, + "logps/chosen": -610.2000122070312, + "logps/rejected": -542.7999877929688, + "loss": 0.5325042724609375, + "memory(GiB)": 56.27, + "nll_loss": 0.532031238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.512499809265137, + "rewards/margins": 10.462499618530273, + "rewards/rejected": -0.964062511920929, + "step": 35, + "train_speed(iter/s)": 0.081424 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.49646039527754704, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.703125, + "logps/chosen": -476.79998779296875, + "logps/rejected": -578.0, + "loss": 0.5092529296875, + "memory(GiB)": 56.27, + "nll_loss": 0.5648437738418579, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.912500381469727, + "rewards/margins": 11.987500190734863, + "rewards/rejected": -2.090625047683716, + "step": 40, + "train_speed(iter/s)": 0.081711 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.296875, + "eval_logps/chosen": -136.0, + "eval_logps/rejected": -1080.0, + "eval_loss": 0.294677734375, + "eval_nll_loss": 0.294921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.5, + "eval_rewards/margins": 11.1875, + "eval_rewards/rejected": -0.69921875, + "eval_runtime": 4.3578, + "eval_samples_per_second": 0.918, + "eval_steps_per_second": 0.229, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.4969045495850996, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -1.6328125, + 
"logits/rejected": -1.6015625, + "logps/chosen": -444.6000061035156, + "logps/rejected": -614.0, + "loss": 0.4313232421875, + "memory(GiB)": 57.72, + "nll_loss": 0.43085938692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.699999809265137, + "rewards/margins": 13.237500190734863, + "rewards/rejected": -2.549999952316284, + "step": 45, + "train_speed(iter/s)": 0.080571 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.2582495580909455, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7218749523162842, + "logps/chosen": -496.0, + "logps/rejected": -584.4000244140625, + "loss": 0.4834228515625, + "memory(GiB)": 57.72, + "nll_loss": 0.4828124940395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 13.399999618530273, + "rewards/rejected": -2.135937452316284, + "step": 50, + "train_speed(iter/s)": 0.080901 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.1378301574581502, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -1.584375023841858, + "logits/rejected": -1.6453125476837158, + "logps/chosen": -475.6000061035156, + "logps/rejected": -534.4000244140625, + "loss": 0.46617431640625, + "memory(GiB)": 57.72, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.800000190734863, + "rewards/margins": 14.0, + "rewards/rejected": -2.214062452316284, + "step": 55, + "train_speed(iter/s)": 0.08215 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.12374674083229541, + "learning_rate": 5e-05, + "logits/chosen": -1.5671875476837158, + "logits/rejected": -1.6515624523162842, + "logps/chosen": -515.2000122070312, + "logps/rejected": -377.6000061035156, + "loss": 0.42254638671875, + "memory(GiB)": 57.72, + "nll_loss": 0.42304688692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.524999618530273, + "rewards/margins": 14.125, + "rewards/rejected": -1.598046898841858, + "step": 60, + "train_speed(iter/s)": 0.082614 + }, + { + 
"epoch": 1.5789473684210527, + "eval_logits/chosen": -1.6328125, + "eval_logits/rejected": -1.2890625, + "eval_logps/chosen": -131.0, + "eval_logps/rejected": -1072.0, + "eval_loss": 0.285400390625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.875, + "eval_rewards/margins": 10.6875, + "eval_rewards/rejected": 0.2001953125, + "eval_runtime": 4.4113, + "eval_samples_per_second": 0.907, + "eval_steps_per_second": 0.227, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 252827512340480.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/training_args.bin b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..70d93cb167733b03530541b24294d8d696f9d63f --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52168d9a8e1df75ad4897f691add30ba782a6014e94426406b369d12e504dd11 +size 9016 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/zero_to_fp32.py b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-60/zero_to_fp32.py @@ -0,0 
+1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # 
there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v 
in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/README.md b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb68a7d6caeadf83c94bf41a98174b4994de1bfb --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-32b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/adapter_config.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a869ebfcab317c3d514f62adbd5d078fd257a39a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": 
{}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "down_proj", + "q_proj", + "up_proj", + "o_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/adapter_model.safetensors b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ed058591ea4eff5d67d1130c3ad405c322c6348 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cc7d6b5b6154a75f2186b50b1c611dd756f8b4dae1e252e5531f036ace01001 +size 134337704 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/additional_config.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/args.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/args.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe933211dec07797d29e82d5866d38722980d87 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "model_type": 
"deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + 
"prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + 
"fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + 
"push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + 
"galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-32b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, 
config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-32b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, 
per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, 
gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, 
include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe62eb2870c6f26795806ef5978eefaad297699a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1889859a10d8229ed75dc7e3720bb36fad3b0baf54ee5cae8deb7c60398c382b +size 100667312 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f7b58835d9870b232e6cb08f7d0cd28fd7d81a6 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e755aefb3fcd145d7a91faf0fd2d52b1e45b9b6783335e278ee23c4a801c90 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c930097525d56951aba09d6b007d501799b5d32c --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56365b816dbfbd72ce76801f7b6e4883bc12ac4624d8f6b480abe0c01108da1f +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92934498590022b19a11ae253278628f839ce4b8 --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:367d2653eb51e4821d8d1baf468f39b1e8df4ef54053a3d7f633b0def646a144 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c848add0cf52cf8846c1a823228aafc6c376a92 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81449bec4e4942c9a876d91cb6a579a65a2781c4c3c0bc61c123a7bd056b9d54 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b32ad05d4aecac0a0eef2aaf37c51028df0a7c7 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:422f457532c911f496ddc5dd0c47c878399b3279b85ddee203173a5fcc2b1aa8 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..616cc7ed1944c3f88cc41390926288c6e4d49fe3 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07beeffd068d829d4b4bd6fcfef92354d725105c4a906aa2a3d01a8111ea50da +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a3c673c848363e7dcf2fa93eb3b160b2c176057 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25118358aa16d43011d3eee6349cecf6f76c2b86c4e90f286374cda0eb393c56 +size 100667312 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84f6e05655138cb2c183077640365bbdb4d53984 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e4f36d9b56d77055f4f1455f7dbba9bef4402905acf28686157836702329d45a +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d47af3d97f6b0e26c2c92bf7dd281ad184f4dee3 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06f7d1f9bbd4a2c78ed38a2087cb38ecdb6e8260c7318a6d76920b2dc1261e32 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f56de979efa68b215b4eead190912770fe5dea33 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0d43d9513d732b53c7ec45ce13e033bd48cf37dd35e17b71a3b698f960fe37 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dce5eda351453c8a351aa5379b8f519f8291bc38 --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e88dd8b692def807027787e5975f23ec8bec98f15ac6f1f2753dccf64c6f98a +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9337d6678ad945148a0b7d566151166d9056770a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81bfc06d39b12eb5f9736867ace7484df1b7f11843c1a3fc7933303d58736a03 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..868b6b65b8b93da7dfb3eb35852eb5baf3cd2715 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:746f3dbbe62e70fe59fef4f91064c45a4351929431dad77b3aa9a0fe8ce82df7 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb315f9ade2849d0d5f5d4549949f173d0f7305d --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10a8daf139a8e67152e96758cf6a94bdd58c880092af4299205e60bdd19f283a +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8312aa35b8dd8dd914e61d5b20963f3f5a5dce54 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:301c209ab202ee9b6b96f1857aa3a9cfcbf8f1b971f20e9f8d21d0ca0f959b93 +size 886254 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/latest b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..75eab498d0366633484ab40334e4b8fb92b16dad --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/latest @@ -0,0 +1 @@ +global_step80 \ No newline at end of file diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_0.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..572d9bd86f4559e91e7b9a4fdc47494e5c6e9568 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d7e02ffb4d440dce7ab4ce0b5617578ec9ce3672acee7434ed6f1153f1ae0c +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_1.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d21df4c1d8717a3994f151fbc05460a0172725e --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b40ca759e432b2688f021b81291d74a40f56a205e9842119f7e772275eebd3 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_2.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6048bfa1e35e3b563aec9f5c1c6788496c3f068d --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdaef955ddd36d6bc1c40584113dd6205483e2aa85b02439b8b27e82e02a8359 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_3.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3722ed81a034ae380c794d8b45b0464c00099aa6 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_3.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b14ae5db356e6512538751d6b386c190754e307cc99cd652d5c6dd891e1f82 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_4.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..13231ff967baa9c056d5a7ec0cc489a62679039c --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26e28be26826eeeed244b77185c67b443ac185175f8d4bf5ba94caa8b271bc5 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_5.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3401deecf687fd1382dae699b8d2e1a52949a4a --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847cedc1d6ca26f299a132c2ade9754887374acb9d98f26594a85d4c7742d474 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_6.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ab10b8ff32ba08d69bdf75cb904d226b3d9008 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd043d1690ae0ff6991b03322799a0b28f021427b15fd9f1e5ed8b9905d9307 +size 15984 diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_7.pth b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c6fb5670c4f108f08c81f04f22272cdd57b7745 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772190f7e6667c865d25fc72da7bdd1b5d39f46fe03bb5c2d754aee1ad3c99c7 +size 15984 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/scheduler.pt b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..eaf96d6803aea265d756d902db3c4cc2386f9742 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90524bcdb94734ac7120e4205110f14662bff8cee00eed50355875dcdc538029 +size 1064 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/trainer_state.json b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0d6417a64f350f1d53bbaaca0921e475383b7b32 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.28369141, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80", + "epoch": 2.1052631578947367, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": 
true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 4.865237153661176, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": -1.625, + "logits/rejected": -1.6015625, + "logps/chosen": -664.0, + "logps/rejected": -243.0, + "loss": 1.279296875, + "memory(GiB)": 14.31, + "nll_loss": 0.58984375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.060374 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 8.264469185177074, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -1.572265625, + "logits/rejected": -1.572265625, + "logps/chosen": -676.25, + "logps/rejected": -368.0, + "loss": 1.886962890625, + "memory(GiB)": 24.25, + "nll_loss": 1.1826171875, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.0015869140625, + "rewards/margins": -0.00927734375, + "rewards/rejected": 0.01096343994140625, + "step": 5, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 4.869686740353647, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": -1.506250023841858, + "logits/rejected": -1.564062476158142, + "logps/chosen": -566.7999877929688, + "logps/rejected": -591.7999877929688, + "loss": 1.74365234375, + "memory(GiB)": 56.27, + "nll_loss": 1.2609374523162842, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.384374976158142, + "rewards/margins": 0.6645263433456421, + "rewards/rejected": 0.719042956829071, + "step": 10, + "train_speed(iter/s)": 0.078572 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.219941119836901, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -1.592187523841858, + "logits/rejected": -1.670312523841858, + "logps/chosen": -587.2000122070312, + "logps/rejected": -406.6000061035156, + "loss": 0.95322265625, + "memory(GiB)": 56.27, + "nll_loss": 0.7054687738418579, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 
4.762499809265137, + "rewards/margins": 3.004687547683716, + "rewards/rejected": 1.764062523841858, + "step": 15, + "train_speed(iter/s)": 0.08295 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 2.2412220923276367, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -1.6375000476837158, + "logits/rejected": -1.610937476158142, + "logps/chosen": -369.0, + "logps/rejected": -490.0, + "loss": 0.7086181640625, + "memory(GiB)": 56.27, + "nll_loss": 0.616406261920929, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.375, + "rewards/margins": 3.456249952316284, + "rewards/rejected": 3.924999952316284, + "step": 20, + "train_speed(iter/s)": 0.083066 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.3046875, + "eval_logps/chosen": -147.0, + "eval_logps/rejected": -1020.0, + "eval_loss": 0.38232421875, + "eval_nll_loss": 0.33203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 9.3125, + "eval_rewards/margins": 4.15625, + "eval_rewards/rejected": 5.15625, + "eval_runtime": 4.3657, + "eval_samples_per_second": 0.916, + "eval_steps_per_second": 0.229, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 1.097155209357345, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -1.626562476158142, + "logits/rejected": -1.576562523841858, + "logps/chosen": -538.7999877929688, + "logps/rejected": -572.2000122070312, + "loss": 0.53927001953125, + "memory(GiB)": 56.27, + "nll_loss": 0.513671875, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.412500381469727, + "rewards/margins": 6.025000095367432, + "rewards/rejected": 2.371875047683716, + "step": 25, + "train_speed(iter/s)": 0.08148 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8998878137633866, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.681249976158142, + "logps/chosen": -396.3999938964844, + "logps/rejected": -498.3999938964844, + "loss": 0.46839599609375, + 
"memory(GiB)": 56.27, + "nll_loss": 0.46757811307907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.949999809265137, + "rewards/margins": 8.787500381469727, + "rewards/rejected": 0.15253905951976776, + "step": 30, + "train_speed(iter/s)": 0.081373 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.4025215648853001, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -1.665624976158142, + "logits/rejected": -1.6828124523162842, + "logps/chosen": -610.2000122070312, + "logps/rejected": -542.7999877929688, + "loss": 0.5325042724609375, + "memory(GiB)": 56.27, + "nll_loss": 0.532031238079071, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.512499809265137, + "rewards/margins": 10.462499618530273, + "rewards/rejected": -0.964062511920929, + "step": 35, + "train_speed(iter/s)": 0.081424 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.49646039527754704, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -1.6437499523162842, + "logits/rejected": -1.703125, + "logps/chosen": -476.79998779296875, + "logps/rejected": -578.0, + "loss": 0.5092529296875, + "memory(GiB)": 56.27, + "nll_loss": 0.5648437738418579, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.912500381469727, + "rewards/margins": 11.987500190734863, + "rewards/rejected": -2.090625047683716, + "step": 40, + "train_speed(iter/s)": 0.081711 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -1.625, + "eval_logits/rejected": -1.296875, + "eval_logps/chosen": -136.0, + "eval_logps/rejected": -1080.0, + "eval_loss": 0.294677734375, + "eval_nll_loss": 0.294921875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.5, + "eval_rewards/margins": 11.1875, + "eval_rewards/rejected": -0.69921875, + "eval_runtime": 4.3578, + "eval_samples_per_second": 0.918, + "eval_steps_per_second": 0.229, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.4969045495850996, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -1.6328125, + 
"logits/rejected": -1.6015625, + "logps/chosen": -444.6000061035156, + "logps/rejected": -614.0, + "loss": 0.4313232421875, + "memory(GiB)": 57.72, + "nll_loss": 0.43085938692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.699999809265137, + "rewards/margins": 13.237500190734863, + "rewards/rejected": -2.549999952316284, + "step": 45, + "train_speed(iter/s)": 0.080571 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.2582495580909455, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": -1.703125, + "logits/rejected": -1.7218749523162842, + "logps/chosen": -496.0, + "logps/rejected": -584.4000244140625, + "loss": 0.4834228515625, + "memory(GiB)": 57.72, + "nll_loss": 0.4828124940395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.25, + "rewards/margins": 13.399999618530273, + "rewards/rejected": -2.135937452316284, + "step": 50, + "train_speed(iter/s)": 0.080901 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.1378301574581502, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -1.584375023841858, + "logits/rejected": -1.6453125476837158, + "logps/chosen": -475.6000061035156, + "logps/rejected": -534.4000244140625, + "loss": 0.46617431640625, + "memory(GiB)": 57.72, + "nll_loss": 0.466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.800000190734863, + "rewards/margins": 14.0, + "rewards/rejected": -2.214062452316284, + "step": 55, + "train_speed(iter/s)": 0.08215 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.12374674083229541, + "learning_rate": 5e-05, + "logits/chosen": -1.5671875476837158, + "logits/rejected": -1.6515624523162842, + "logps/chosen": -515.2000122070312, + "logps/rejected": -377.6000061035156, + "loss": 0.42254638671875, + "memory(GiB)": 57.72, + "nll_loss": 0.42304688692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.524999618530273, + "rewards/margins": 14.125, + "rewards/rejected": -1.598046898841858, + "step": 60, + "train_speed(iter/s)": 0.082614 + }, + { + 
"epoch": 1.5789473684210527, + "eval_logits/chosen": -1.6328125, + "eval_logits/rejected": -1.2890625, + "eval_logps/chosen": -131.0, + "eval_logps/rejected": -1072.0, + "eval_loss": 0.285400390625, + "eval_nll_loss": 0.28515625, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.875, + "eval_rewards/margins": 10.6875, + "eval_rewards/rejected": 0.2001953125, + "eval_runtime": 4.4113, + "eval_samples_per_second": 0.907, + "eval_steps_per_second": 0.227, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.25020620326140985, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -1.6765625476837158, + "logits/rejected": -1.5906250476837158, + "logps/chosen": -453.20001220703125, + "logps/rejected": -469.6000061035156, + "loss": 0.399969482421875, + "memory(GiB)": 57.72, + "nll_loss": 0.39921873807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.475000381469727, + "rewards/margins": 14.649999618530273, + "rewards/rejected": -2.171875, + "step": 65, + "train_speed(iter/s)": 0.082354 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.26138680103939765, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": -1.640625, + "logits/rejected": -1.703125, + "logps/chosen": -408.6000061035156, + "logps/rejected": -489.79998779296875, + "loss": 0.4444091796875, + "memory(GiB)": 57.72, + "nll_loss": 0.44453126192092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.287500381469727, + "rewards/margins": 13.925000190734863, + "rewards/rejected": -1.6437499523162842, + "step": 70, + "train_speed(iter/s)": 0.082355 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.44633184477159904, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": -1.6015625, + "logits/rejected": -1.5499999523162842, + "logps/chosen": -506.3999938964844, + "logps/rejected": -582.4000244140625, + "loss": 0.4694580078125, + "memory(GiB)": 57.72, + "nll_loss": 0.46953123807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 
12.550000190734863, + "rewards/margins": 14.512499809265137, + "rewards/rejected": -1.9695312976837158, + "step": 75, + "train_speed(iter/s)": 0.082702 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.3116745813140644, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": -1.6687500476837158, + "logits/rejected": -1.609375, + "logps/chosen": -433.6000061035156, + "logps/rejected": -563.5999755859375, + "loss": 0.46090087890625, + "memory(GiB)": 57.72, + "nll_loss": 0.4710937440395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.612500190734863, + "rewards/margins": 15.0625, + "rewards/rejected": -2.4468750953674316, + "step": 80, + "train_speed(iter/s)": 0.08211 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -1.6171875, + "eval_logits/rejected": -1.28125, + "eval_logps/chosen": -130.0, + "eval_logps/rejected": -1056.0, + "eval_loss": 0.28369140625, + "eval_nll_loss": 0.283203125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 11.0, + "eval_rewards/margins": 9.5, + "eval_rewards/rejected": 1.5, + "eval_runtime": 4.4281, + "eval_samples_per_second": 0.903, + "eval_steps_per_second": 0.226, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 338272376061952.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/training_args.bin b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..70d93cb167733b03530541b24294d8d696f9d63f --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52168d9a8e1df75ad4897f691add30ba782a6014e94426406b369d12e504dd11 +size 9016 diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/zero_to_fp32.py b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logits_chosen.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..ef6f75ce1114946a517a919a39a3bc6fdb3192a7 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logits_chosen.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logits_rejected.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..7770d2aed4cacaa78c0bfd62e1f76d6e20850338 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logits_rejected.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logps_chosen.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..581816afc8cfcd75e73576fce4f0d6a26ce8340f Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logps_chosen.png differ diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logps_rejected.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..8a6ee02e6c09db15c93e8ee0a6dc480a67835054 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_logps_rejected.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_loss.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..acefa0d4b64ef7d5a2fc699fc67db6bf6837be47 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_loss.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_nll_loss.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..d161d93a6e548362b3fd68b24ddb1696dc1f31a5 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_nll_loss.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_accuracies.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..b359853ac747638a7e44efa30e8bcf81f3befcfc Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_accuracies.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_chosen.png 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..e9128c5e775998d5c9258e8ddc00985c0b80addb Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_chosen.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_margins.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..d7bea2bdd31563f1cf3aa07c8155207ac2ce75b4 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_margins.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_rejected.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..598082980b5fb619df21a2696d8d812ce45abc37 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_rewards_rejected.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_runtime.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..278109a4bed343d610c6765084c93ed64fd356a5 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_runtime.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_samples_per_second.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_samples_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..b004824c98230f0032e73ae74993ab9dc4de6962 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_samples_per_second.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_steps_per_second.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..41e9b50a4d08f4d71766bf8b3b9044d6e8a09029 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/eval_steps_per_second.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_epoch.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..c152c9bbb1100d680125e899d3d0da3db98dc221 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_epoch.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_grad_norm.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..1d20671dd6ff47e46373d99a482d706ad794756d Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_grad_norm.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_learning_rate.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..e35e42c173f451f9666ba448872c615528441c9e Binary files /dev/null and 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_learning_rate.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logits_chosen.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..26998c921d4e6d7a947e9632f8cc760054bf54d8 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logits_chosen.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logits_rejected.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..b9e84c62faff2e50a953cca4f32cb7e97014bf75 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logits_rejected.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logps_chosen.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..084aadeacd0992eb05b0694ef904af66a2b15ebc Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logps_chosen.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logps_rejected.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..ea7dd7cddfc15218781f7c5c451ec9661e695af0 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_logps_rejected.png differ diff --git 
a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_loss.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..b22d6b6ee0f06967b22debbc6cf5bbff0aab4d01 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_loss.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_memory(GiB).png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..979c51fa2e393bd30db435c7d29a0503dc1d8450 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_memory(GiB).png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_nll_loss.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..38c968f7eca0eb9e57e2c50f293ae3b538e89e06 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_nll_loss.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_accuracies.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..484ef5a9ff73fbf3953b4db6b8e645f013ba5724 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_accuracies.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_chosen.png 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..d18c259f3201afdeb8c3051d150b7eee2d141d01 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_chosen.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_margins.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..704f3caeca252225bf1b367ea867014deec3dc8f Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_margins.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_rejected.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..6b12aac45c596a0e4951eaa4ccc92b38abbd103c Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_rewards_rejected.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_total_flos.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..f499d516ab3f9b110c1f51a7203694510c1c48ed Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_total_flos.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_loss.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_loss.png new file mode 100644 index 
0000000000000000000000000000000000000000..b98210380e236361dad700b49a8f668f14852554 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_loss.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_runtime.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..42b40d4e2aa37242c3d7cefa07acb000c9b65620 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_runtime.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_samples_per_second.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..3075b8d8416237e4ab412ee43805fb74417e28e2 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_samples_per_second.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_speed(iter_s).png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..942a898e95bf2dddcc257efa44f5bc20b3024dd9 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_speed(iter_s).png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_steps_per_second.png b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_steps_per_second.png new file mode 100644 index 
0000000000000000000000000000000000000000..e8798f373d18bba4a1d1c9d262d5dacd0bccf5f0 Binary files /dev/null and b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/images/train_train_steps_per_second.png differ diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/logging.jsonl b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7736592a1803d5afe1f5391d5aa00746219dcff5 --- /dev/null +++ b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/logging.jsonl @@ -0,0 +1,31 @@ +{"loss": 1.27929688, "grad_norm": 4.86523715, "learning_rate": 1.667e-05, "memory(GiB)": 14.31, "train_speed(iter/s)": 0.060374, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/chosen": -664.0, "logps/rejected": -243.0, "logits/chosen": -1.625, "logits/rejected": -1.6015625, "nll_loss": 0.58984375, "epoch": 0.02631579, "global_step/max_steps": "1/114", "percentage": "0.88%", "elapsed_time": "12s", "remaining_time": "23m 23s"} +{"loss": 1.88696289, "grad_norm": 8.26446919, "learning_rate": 8.333e-05, "memory(GiB)": 24.25, "train_speed(iter/s)": 0.080912, "rewards/chosen": 0.00158691, "rewards/rejected": 0.01096344, "rewards/accuracies": 0.1875, "rewards/margins": -0.00927734, "logps/chosen": -676.25, "logps/rejected": -368.0, "logits/chosen": -1.57226562, "logits/rejected": -1.57226562, "nll_loss": 1.18261719, "epoch": 0.13157895, "global_step/max_steps": "5/114", "percentage": "4.39%", "elapsed_time": "57s", "remaining_time": "20m 57s"} +{"loss": 1.74365234, "grad_norm": 4.86968674, "learning_rate": 9.966e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.078572, "rewards/chosen": 1.38437498, "rewards/rejected": 0.71904296, "rewards/accuracies": 0.77499998, "rewards/margins": 0.66452634, "logps/chosen": -566.79998779, "logps/rejected": -591.79998779, "logits/chosen": 
-1.50625002, "logits/rejected": -1.56406248, "nll_loss": 1.26093745, "epoch": 0.26315789, "global_step/max_steps": "10/114", "percentage": "8.77%", "elapsed_time": "2m 3s", "remaining_time": "21m 20s"} +{"loss": 0.95322266, "grad_norm": 1.21994112, "learning_rate": 9.83e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.08295, "rewards/chosen": 4.76249981, "rewards/rejected": 1.76406252, "rewards/accuracies": 0.89999998, "rewards/margins": 3.00468755, "logps/chosen": -587.20001221, "logps/rejected": -406.6000061, "logits/chosen": -1.59218752, "logits/rejected": -1.67031252, "nll_loss": 0.70546877, "epoch": 0.39473684, "global_step/max_steps": "15/114", "percentage": "13.16%", "elapsed_time": "2m 56s", "remaining_time": "19m 26s"} +{"loss": 0.70861816, "grad_norm": 2.24122209, "learning_rate": 9.591e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.083066, "rewards/chosen": 7.375, "rewards/rejected": 3.92499995, "rewards/accuracies": 1.0, "rewards/margins": 3.45624995, "logps/chosen": -369.0, "logps/rejected": -490.0, "logits/chosen": -1.63750005, "logits/rejected": -1.61093748, "nll_loss": 0.61640626, "epoch": 0.52631579, "global_step/max_steps": "20/114", "percentage": "17.54%", "elapsed_time": "3m 56s", "remaining_time": "18m 32s"} +{"eval_loss": 0.38232422, "eval_runtime": 4.3657, "eval_samples_per_second": 0.916, "eval_steps_per_second": 0.229, "eval_rewards/chosen": 9.3125, "eval_rewards/rejected": 5.15625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 4.15625, "eval_logps/chosen": -147.0, "eval_logps/rejected": -1020.0, "eval_logits/chosen": -1.625, "eval_logits/rejected": -1.3046875, "eval_nll_loss": 0.33203125, "epoch": 0.52631579, "global_step/max_steps": "20/114", "percentage": "17.54%", "elapsed_time": "4m 1s", "remaining_time": "18m 52s"} +{"loss": 0.53927002, "grad_norm": 1.09715521, "learning_rate": 9.256e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.08148, "rewards/chosen": 8.41250038, "rewards/rejected": 2.37187505, 
"rewards/accuracies": 1.0, "rewards/margins": 6.0250001, "logps/chosen": -538.79998779, "logps/rejected": -572.20001221, "logits/chosen": -1.62656248, "logits/rejected": -1.57656252, "nll_loss": 0.51367188, "epoch": 0.65789474, "global_step/max_steps": "25/114", "percentage": "21.93%", "elapsed_time": "5m 2s", "remaining_time": "17m 57s"} +{"loss": 0.468396, "grad_norm": 0.89988781, "learning_rate": 8.83e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.081373, "rewards/chosen": 8.94999981, "rewards/rejected": 0.15253906, "rewards/accuracies": 1.0, "rewards/margins": 8.78750038, "logps/chosen": -396.3999939, "logps/rejected": -498.3999939, "logits/chosen": -1.640625, "logits/rejected": -1.68124998, "nll_loss": 0.46757811, "epoch": 0.78947368, "global_step/max_steps": "30/114", "percentage": "26.32%", "elapsed_time": "6m 4s", "remaining_time": "17m 0s"} +{"loss": 0.53250427, "grad_norm": 0.40252156, "learning_rate": 8.324e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.081424, "rewards/chosen": 9.51249981, "rewards/rejected": -0.96406251, "rewards/accuracies": 1.0, "rewards/margins": 10.46249962, "logps/chosen": -610.20001221, "logps/rejected": -542.79998779, "logits/chosen": -1.66562498, "logits/rejected": -1.68281245, "nll_loss": 0.53203124, "epoch": 0.92105263, "global_step/max_steps": "35/114", "percentage": "30.70%", "elapsed_time": "7m 5s", "remaining_time": "16m 0s"} +{"loss": 0.50925293, "grad_norm": 0.4964604, "learning_rate": 7.748e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.081711, "rewards/chosen": 9.91250038, "rewards/rejected": -2.09062505, "rewards/accuracies": 1.0, "rewards/margins": 11.98750019, "logps/chosen": -476.79998779, "logps/rejected": -578.0, "logits/chosen": -1.64374995, "logits/rejected": -1.703125, "nll_loss": 0.56484377, "epoch": 1.05263158, "global_step/max_steps": "40/114", "percentage": "35.09%", "elapsed_time": "8m 5s", "remaining_time": "14m 57s"} +{"eval_loss": 0.29467773, "eval_runtime": 4.3578, 
"eval_samples_per_second": 0.918, "eval_steps_per_second": 0.229, "eval_rewards/chosen": 10.5, "eval_rewards/rejected": -0.69921875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.1875, "eval_logps/chosen": -136.0, "eval_logps/rejected": -1080.0, "eval_logits/chosen": -1.625, "eval_logits/rejected": -1.296875, "eval_nll_loss": 0.29492188, "epoch": 1.05263158, "global_step/max_steps": "40/114", "percentage": "35.09%", "elapsed_time": "8m 9s", "remaining_time": "15m 6s"} +{"loss": 0.43132324, "grad_norm": 0.49690455, "learning_rate": 7.113e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.080571, "rewards/chosen": 10.69999981, "rewards/rejected": -2.54999995, "rewards/accuracies": 1.0, "rewards/margins": 13.23750019, "logps/chosen": -444.6000061, "logps/rejected": -614.0, "logits/chosen": -1.6328125, "logits/rejected": -1.6015625, "nll_loss": 0.43085939, "epoch": 1.18421053, "global_step/max_steps": "45/114", "percentage": "39.47%", "elapsed_time": "9m 14s", "remaining_time": "14m 10s"} +{"loss": 0.48342285, "grad_norm": 0.25824956, "learning_rate": 6.434e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.080901, "rewards/chosen": 11.25, "rewards/rejected": -2.13593745, "rewards/accuracies": 1.0, "rewards/margins": 13.39999962, "logps/chosen": -496.0, "logps/rejected": -584.40002441, "logits/chosen": -1.703125, "logits/rejected": -1.72187495, "nll_loss": 0.48281249, "epoch": 1.31578947, "global_step/max_steps": "50/114", "percentage": "43.86%", "elapsed_time": "10m 13s", "remaining_time": "13m 5s"} +{"loss": 0.46617432, "grad_norm": 0.13783016, "learning_rate": 5.725e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.08215, "rewards/chosen": 11.80000019, "rewards/rejected": -2.21406245, "rewards/accuracies": 1.0, "rewards/margins": 14.0, "logps/chosen": -475.6000061, "logps/rejected": -534.40002441, "logits/chosen": -1.58437502, "logits/rejected": -1.64531255, "nll_loss": 0.46679688, "epoch": 1.44736842, "global_step/max_steps": "55/114", 
"percentage": "48.25%", "elapsed_time": "11m 5s", "remaining_time": "11m 53s"} +{"loss": 0.42254639, "grad_norm": 0.12374674, "learning_rate": 5e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082614, "rewards/chosen": 12.52499962, "rewards/rejected": -1.5980469, "rewards/accuracies": 1.0, "rewards/margins": 14.125, "logps/chosen": -515.20001221, "logps/rejected": -377.6000061, "logits/chosen": -1.56718755, "logits/rejected": -1.65156245, "nll_loss": 0.42304689, "epoch": 1.57894737, "global_step/max_steps": "60/114", "percentage": "52.63%", "elapsed_time": "12m 2s", "remaining_time": "10m 49s"} +{"eval_loss": 0.28540039, "eval_runtime": 4.4113, "eval_samples_per_second": 0.907, "eval_steps_per_second": 0.227, "eval_rewards/chosen": 10.875, "eval_rewards/rejected": 0.20019531, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.6875, "eval_logps/chosen": -131.0, "eval_logps/rejected": -1072.0, "eval_logits/chosen": -1.6328125, "eval_logits/rejected": -1.2890625, "eval_nll_loss": 0.28515625, "epoch": 1.57894737, "global_step/max_steps": "60/114", "percentage": "52.63%", "elapsed_time": "12m 6s", "remaining_time": "10m 53s"} +{"loss": 0.39996948, "grad_norm": 0.2502062, "learning_rate": 4.275e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082354, "rewards/chosen": 12.47500038, "rewards/rejected": -2.171875, "rewards/accuracies": 1.0, "rewards/margins": 14.64999962, "logps/chosen": -453.20001221, "logps/rejected": -469.6000061, "logits/chosen": -1.67656255, "logits/rejected": -1.59062505, "nll_loss": 0.39921874, "epoch": 1.71052632, "global_step/max_steps": "65/114", "percentage": "57.02%", "elapsed_time": "13m 5s", "remaining_time": "9m 51s"} +{"loss": 0.44440918, "grad_norm": 0.2613868, "learning_rate": 3.566e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082355, "rewards/chosen": 12.28750038, "rewards/rejected": -1.64374995, "rewards/accuracies": 1.0, "rewards/margins": 13.92500019, "logps/chosen": -408.6000061, "logps/rejected": -489.79998779, 
"logits/chosen": -1.640625, "logits/rejected": -1.703125, "nll_loss": 0.44453126, "epoch": 1.84210526, "global_step/max_steps": "70/114", "percentage": "61.40%", "elapsed_time": "14m 5s", "remaining_time": "8m 51s"} +{"loss": 0.46945801, "grad_norm": 0.44633184, "learning_rate": 2.887e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082702, "rewards/chosen": 12.55000019, "rewards/rejected": -1.9695313, "rewards/accuracies": 1.0, "rewards/margins": 14.51249981, "logps/chosen": -506.3999939, "logps/rejected": -582.40002441, "logits/chosen": -1.6015625, "logits/rejected": -1.54999995, "nll_loss": 0.46953124, "epoch": 1.97368421, "global_step/max_steps": "75/114", "percentage": "65.79%", "elapsed_time": "15m 2s", "remaining_time": "7m 49s"} +{"loss": 0.46090088, "grad_norm": 0.31167458, "learning_rate": 2.252e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.08211, "rewards/chosen": 12.61250019, "rewards/rejected": -2.4468751, "rewards/accuracies": 1.0, "rewards/margins": 15.0625, "logps/chosen": -433.6000061, "logps/rejected": -563.59997559, "logits/chosen": -1.66875005, "logits/rejected": -1.609375, "nll_loss": 0.47109374, "epoch": 2.10526316, "global_step/max_steps": "80/114", "percentage": "70.18%", "elapsed_time": "16m 10s", "remaining_time": "6m 52s"} +{"eval_loss": 0.28369141, "eval_runtime": 4.4281, "eval_samples_per_second": 0.903, "eval_steps_per_second": 0.226, "eval_rewards/chosen": 11.0, "eval_rewards/rejected": 1.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.5, "eval_logps/chosen": -130.0, "eval_logps/rejected": -1056.0, "eval_logits/chosen": -1.6171875, "eval_logits/rejected": -1.28125, "eval_nll_loss": 0.28320312, "epoch": 2.10526316, "global_step/max_steps": "80/114", "percentage": "70.18%", "elapsed_time": "16m 14s", "remaining_time": "6m 54s"} +{"loss": 0.4051178, "grad_norm": 0.3778187, "learning_rate": 1.676e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082, "rewards/chosen": 12.67500019, "rewards/rejected": -0.42578125, 
"rewards/accuracies": 1.0, "rewards/margins": 13.08749962, "logps/chosen": -384.3999939, "logps/rejected": -660.0, "logits/chosen": -1.5859375, "logits/rejected": -1.59218752, "nll_loss": 0.40507811, "epoch": 2.23684211, "global_step/max_steps": "85/114", "percentage": "74.56%", "elapsed_time": "17m 12s", "remaining_time": "5m 52s"} +{"loss": 0.43151398, "grad_norm": 0.18604144, "learning_rate": 1.17e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082606, "rewards/chosen": 14.0, "rewards/rejected": -1.62333989, "rewards/accuracies": 1.0, "rewards/margins": 15.61250019, "logps/chosen": -488.79998779, "logps/rejected": -490.0, "logits/chosen": -1.55156255, "logits/rejected": -1.60312498, "nll_loss": 0.43085939, "epoch": 2.36842105, "global_step/max_steps": "90/114", "percentage": "78.95%", "elapsed_time": "18m 5s", "remaining_time": "4m 49s"} +{"loss": 0.45402832, "grad_norm": 0.1474538, "learning_rate": 7.44e-06, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082454, "rewards/chosen": 13.8125, "rewards/rejected": -1.81875002, "rewards/accuracies": 1.0, "rewards/margins": 15.625, "logps/chosen": -521.20001221, "logps/rejected": -601.20001221, "logits/chosen": -1.640625, "logits/rejected": -1.62187505, "nll_loss": 0.45390624, "epoch": 2.5, "global_step/max_steps": "95/114", "percentage": "83.33%", "elapsed_time": "19m 8s", "remaining_time": "3m 49s"} +{"loss": 0.39779358, "grad_norm": 0.27812231, "learning_rate": 4.09e-06, "memory(GiB)": 66.97, "train_speed(iter/s)": 0.082789, "rewards/chosen": 12.83749962, "rewards/rejected": -2.390625, "rewards/accuracies": 1.0, "rewards/margins": 15.22500038, "logps/chosen": -400.0, "logps/rejected": -471.20001221, "logits/chosen": -1.62656248, "logits/rejected": -1.54843748, "nll_loss": 0.39804688, "epoch": 2.63157895, "global_step/max_steps": "100/114", "percentage": "87.72%", "elapsed_time": "20m 3s", "remaining_time": "2m 48s"} +{"eval_loss": 0.28149414, "eval_runtime": 4.383, "eval_samples_per_second": 0.913, 
"eval_steps_per_second": 0.228, "eval_rewards/chosen": 11.0625, "eval_rewards/rejected": 0.90234375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.1875, "eval_logps/chosen": -129.0, "eval_logps/rejected": -1064.0, "eval_logits/chosen": -1.609375, "eval_logits/rejected": -1.2734375, "eval_nll_loss": 0.28125, "epoch": 2.63157895, "global_step/max_steps": "100/114", "percentage": "87.72%", "elapsed_time": "20m 8s", "remaining_time": "2m 49s"} +{"loss": 0.43348083, "grad_norm": 0.47078492, "learning_rate": 1.7e-06, "memory(GiB)": 66.97, "train_speed(iter/s)": 0.082503, "rewards/chosen": 12.97500038, "rewards/rejected": -1.77890623, "rewards/accuracies": 1.0, "rewards/margins": 14.73750019, "logps/chosen": -436.20001221, "logps/rejected": -444.20001221, "logits/chosen": -1.65937495, "logits/rejected": -1.6484375, "nll_loss": 0.43320313, "epoch": 2.76315789, "global_step/max_steps": "105/114", "percentage": "92.11%", "elapsed_time": "21m 8s", "remaining_time": "1m 48s"} +{"loss": 0.3857605, "grad_norm": 0.27296904, "learning_rate": 3.4e-07, "memory(GiB)": 66.97, "train_speed(iter/s)": 0.082596, "rewards/chosen": 13.91250038, "rewards/rejected": -1.81406248, "rewards/accuracies": 1.0, "rewards/margins": 15.73750019, "logps/chosen": -451.6000061, "logps/rejected": -476.0, "logits/chosen": -1.6171875, "logits/rejected": -1.63437498, "nll_loss": 0.38554686, "epoch": 2.89473684, "global_step/max_steps": "110/114", "percentage": "96.49%", "elapsed_time": "22m 7s", "remaining_time": "48s"} +{"eval_loss": 0.28076172, "eval_runtime": 4.3424, "eval_samples_per_second": 0.921, "eval_steps_per_second": 0.23, "eval_rewards/chosen": 11.0625, "eval_rewards/rejected": 0.65234375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.4375, "eval_logps/chosen": -129.0, "eval_logps/rejected": -1064.0, "eval_logits/chosen": -1.609375, "eval_logits/rejected": -1.2734375, "eval_nll_loss": 0.28125, "epoch": 3.0, "global_step/max_steps": "114/114", "percentage": "100.00%", 
"elapsed_time": "23m 5s", "remaining_time": "0s"} +{"train_runtime": 1388.2018, "train_samples_per_second": 0.644, "train_steps_per_second": 0.082, "total_flos": 478691537715200.0, "train_loss": 0.60841537, "epoch": 3.0, "global_step/max_steps": "114/114", "percentage": "100.00%", "elapsed_time": "23m 8s", "remaining_time": "0s"} +{"train_dataset": "1695.382550±899.293489, min=182.000000, max=4081.000000, size=298", "val_dataset": "1637.250000±797.581461, min=755.000000, max=2485.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 32830.9852M Params (67.1089M Trainable [0.2044%]), 0.0001M Buffers.", "last_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114", "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/checkpoint-114", "best_metric": 0.28076172, "global_step": 114, "log_history": [{"loss": 1.279296875, "grad_norm": 4.865237153661176, "learning_rate": 1.6666666666666667e-05, "memory(GiB)": 14.31, "train_speed(iter/s)": 0.060374, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/chosen": -664.0, "logps/rejected": -243.0, "logits/chosen": -1.625, "logits/rejected": -1.6015625, "nll_loss": 0.58984375, "epoch": 0.02631578947368421, "step": 1}, {"loss": 1.886962890625, "grad_norm": 8.264469185177074, "learning_rate": 8.333333333333334e-05, "memory(GiB)": 24.25, "train_speed(iter/s)": 0.080912, "rewards/chosen": 0.0015869140625, "rewards/rejected": 0.01096343994140625, "rewards/accuracies": 0.1875, "rewards/margins": -0.00927734375, "logps/chosen": -676.25, "logps/rejected": -368.0, "logits/chosen": -1.572265625, "logits/rejected": -1.572265625, "nll_loss": 1.1826171875, "epoch": 0.13157894736842105, "step": 5}, {"loss": 1.74365234375, "grad_norm": 4.869686740353647, "learning_rate": 9.966191788709716e-05, 
"memory(GiB)": 56.27, "train_speed(iter/s)": 0.078572, "rewards/chosen": 1.384374976158142, "rewards/rejected": 0.719042956829071, "rewards/accuracies": 0.7749999761581421, "rewards/margins": 0.6645263433456421, "logps/chosen": -566.7999877929688, "logps/rejected": -591.7999877929688, "logits/chosen": -1.506250023841858, "logits/rejected": -1.564062476158142, "nll_loss": 1.2609374523162842, "epoch": 0.2631578947368421, "step": 10}, {"loss": 0.95322265625, "grad_norm": 1.219941119836901, "learning_rate": 9.829629131445342e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.08295, "rewards/chosen": 4.762499809265137, "rewards/rejected": 1.764062523841858, "rewards/accuracies": 0.8999999761581421, "rewards/margins": 3.004687547683716, "logps/chosen": -587.2000122070312, "logps/rejected": -406.6000061035156, "logits/chosen": -1.592187523841858, "logits/rejected": -1.670312523841858, "nll_loss": 0.7054687738418579, "epoch": 0.39473684210526316, "step": 15}, {"loss": 0.7086181640625, "grad_norm": 2.2412220923276367, "learning_rate": 9.591080534401371e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.083066, "rewards/chosen": 7.375, "rewards/rejected": 3.924999952316284, "rewards/accuracies": 1.0, "rewards/margins": 3.456249952316284, "logps/chosen": -369.0, "logps/rejected": -490.0, "logits/chosen": -1.6375000476837158, "logits/rejected": -1.610937476158142, "nll_loss": 0.616406261920929, "epoch": 0.5263157894736842, "step": 20}, {"eval_loss": 0.38232421875, "eval_runtime": 4.3657, "eval_samples_per_second": 0.916, "eval_steps_per_second": 0.229, "eval_rewards/chosen": 9.3125, "eval_rewards/rejected": 5.15625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 4.15625, "eval_logps/chosen": -147.0, "eval_logps/rejected": -1020.0, "eval_logits/chosen": -1.625, "eval_logits/rejected": -1.3046875, "eval_nll_loss": 0.33203125, "epoch": 0.5263157894736842, "step": 20}, {"loss": 0.53927001953125, "grad_norm": 1.097155209357345, "learning_rate": 9.255583362184999e-05, 
"memory(GiB)": 56.27, "train_speed(iter/s)": 0.08148, "rewards/chosen": 8.412500381469727, "rewards/rejected": 2.371875047683716, "rewards/accuracies": 1.0, "rewards/margins": 6.025000095367432, "logps/chosen": -538.7999877929688, "logps/rejected": -572.2000122070312, "logits/chosen": -1.626562476158142, "logits/rejected": -1.576562523841858, "nll_loss": 0.513671875, "epoch": 0.6578947368421053, "step": 25}, {"loss": 0.46839599609375, "grad_norm": 0.8998878137633866, "learning_rate": 8.83022221559489e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.081373, "rewards/chosen": 8.949999809265137, "rewards/rejected": 0.15253905951976776, "rewards/accuracies": 1.0, "rewards/margins": 8.787500381469727, "logps/chosen": -396.3999938964844, "logps/rejected": -498.3999938964844, "logits/chosen": -1.640625, "logits/rejected": -1.681249976158142, "nll_loss": 0.46757811307907104, "epoch": 0.7894736842105263, "step": 30}, {"loss": 0.5325042724609375, "grad_norm": 0.4025215648853001, "learning_rate": 8.323979328069689e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.081424, "rewards/chosen": 9.512499809265137, "rewards/rejected": -0.964062511920929, "rewards/accuracies": 1.0, "rewards/margins": 10.462499618530273, "logps/chosen": -610.2000122070312, "logps/rejected": -542.7999877929688, "logits/chosen": -1.665624976158142, "logits/rejected": -1.6828124523162842, "nll_loss": 0.532031238079071, "epoch": 0.9210526315789473, "step": 35}, {"loss": 0.5092529296875, "grad_norm": 0.49646039527754704, "learning_rate": 7.74754489035403e-05, "memory(GiB)": 56.27, "train_speed(iter/s)": 0.081711, "rewards/chosen": 9.912500381469727, "rewards/rejected": -2.090625047683716, "rewards/accuracies": 1.0, "rewards/margins": 11.987500190734863, "logps/chosen": -476.79998779296875, "logps/rejected": -578.0, "logits/chosen": -1.6437499523162842, "logits/rejected": -1.703125, "nll_loss": 0.5648437738418579, "epoch": 1.0526315789473684, "step": 40}, {"eval_loss": 0.294677734375, "eval_runtime": 
4.3578, "eval_samples_per_second": 0.918, "eval_steps_per_second": 0.229, "eval_rewards/chosen": 10.5, "eval_rewards/rejected": -0.69921875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 11.1875, "eval_logps/chosen": -136.0, "eval_logps/rejected": -1080.0, "eval_logits/chosen": -1.625, "eval_logits/rejected": -1.296875, "eval_nll_loss": 0.294921875, "epoch": 1.0526315789473684, "step": 40}, {"loss": 0.4313232421875, "grad_norm": 0.4969045495850996, "learning_rate": 7.113091308703498e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.080571, "rewards/chosen": 10.699999809265137, "rewards/rejected": -2.549999952316284, "rewards/accuracies": 1.0, "rewards/margins": 13.237500190734863, "logps/chosen": -444.6000061035156, "logps/rejected": -614.0, "logits/chosen": -1.6328125, "logits/rejected": -1.6015625, "nll_loss": 0.43085938692092896, "epoch": 1.1842105263157894, "step": 45}, {"loss": 0.4834228515625, "grad_norm": 0.2582495580909455, "learning_rate": 6.434016163555452e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.080901, "rewards/chosen": 11.25, "rewards/rejected": -2.135937452316284, "rewards/accuracies": 1.0, "rewards/margins": 13.399999618530273, "logps/chosen": -496.0, "logps/rejected": -584.4000244140625, "logits/chosen": -1.703125, "logits/rejected": -1.7218749523162842, "nll_loss": 0.4828124940395355, "epoch": 1.3157894736842106, "step": 50}, {"loss": 0.46617431640625, "grad_norm": 0.1378301574581502, "learning_rate": 5.724659296536233e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.08215, "rewards/chosen": 11.800000190734863, "rewards/rejected": -2.214062452316284, "rewards/accuracies": 1.0, "rewards/margins": 14.0, "logps/chosen": -475.6000061035156, "logps/rejected": -534.4000244140625, "logits/chosen": -1.584375023841858, "logits/rejected": -1.6453125476837158, "nll_loss": 0.466796875, "epoch": 1.4473684210526316, "step": 55}, {"loss": 0.42254638671875, "grad_norm": 0.12374674083229541, "learning_rate": 5e-05, "memory(GiB)": 57.72, 
"train_speed(iter/s)": 0.082614, "rewards/chosen": 12.524999618530273, "rewards/rejected": -1.598046898841858, "rewards/accuracies": 1.0, "rewards/margins": 14.125, "logps/chosen": -515.2000122070312, "logps/rejected": -377.6000061035156, "logits/chosen": -1.5671875476837158, "logits/rejected": -1.6515624523162842, "nll_loss": 0.42304688692092896, "epoch": 1.5789473684210527, "step": 60}, {"eval_loss": 0.285400390625, "eval_runtime": 4.4113, "eval_samples_per_second": 0.907, "eval_steps_per_second": 0.227, "eval_rewards/chosen": 10.875, "eval_rewards/rejected": 0.2001953125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.6875, "eval_logps/chosen": -131.0, "eval_logps/rejected": -1072.0, "eval_logits/chosen": -1.6328125, "eval_logits/rejected": -1.2890625, "eval_nll_loss": 0.28515625, "epoch": 1.5789473684210527, "step": 60}, {"loss": 0.399969482421875, "grad_norm": 0.25020620326140985, "learning_rate": 4.275340703463767e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082354, "rewards/chosen": 12.475000381469727, "rewards/rejected": -2.171875, "rewards/accuracies": 1.0, "rewards/margins": 14.649999618530273, "logps/chosen": -453.20001220703125, "logps/rejected": -469.6000061035156, "logits/chosen": -1.6765625476837158, "logits/rejected": -1.5906250476837158, "nll_loss": 0.39921873807907104, "epoch": 1.7105263157894737, "step": 65}, {"loss": 0.4444091796875, "grad_norm": 0.26138680103939765, "learning_rate": 3.5659838364445505e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082355, "rewards/chosen": 12.287500381469727, "rewards/rejected": -1.6437499523162842, "rewards/accuracies": 1.0, "rewards/margins": 13.925000190734863, "logps/chosen": -408.6000061035156, "logps/rejected": -489.79998779296875, "logits/chosen": -1.640625, "logits/rejected": -1.703125, "nll_loss": 0.44453126192092896, "epoch": 1.8421052631578947, "step": 70}, {"loss": 0.4694580078125, "grad_norm": 0.44633184477159904, "learning_rate": 2.886908691296504e-05, "memory(GiB)": 57.72, 
"train_speed(iter/s)": 0.082702, "rewards/chosen": 12.550000190734863, "rewards/rejected": -1.9695312976837158, "rewards/accuracies": 1.0, "rewards/margins": 14.512499809265137, "logps/chosen": -506.3999938964844, "logps/rejected": -582.4000244140625, "logits/chosen": -1.6015625, "logits/rejected": -1.5499999523162842, "nll_loss": 0.46953123807907104, "epoch": 1.973684210526316, "step": 75}, {"loss": 0.46090087890625, "grad_norm": 0.3116745813140644, "learning_rate": 2.25245510964597e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.08211, "rewards/chosen": 12.612500190734863, "rewards/rejected": -2.4468750953674316, "rewards/accuracies": 1.0, "rewards/margins": 15.0625, "logps/chosen": -433.6000061035156, "logps/rejected": -563.5999755859375, "logits/chosen": -1.6687500476837158, "logits/rejected": -1.609375, "nll_loss": 0.4710937440395355, "epoch": 2.1052631578947367, "step": 80}, {"eval_loss": 0.28369140625, "eval_runtime": 4.4281, "eval_samples_per_second": 0.903, "eval_steps_per_second": 0.226, "eval_rewards/chosen": 11.0, "eval_rewards/rejected": 1.5, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 9.5, "eval_logps/chosen": -130.0, "eval_logps/rejected": -1056.0, "eval_logits/chosen": -1.6171875, "eval_logits/rejected": -1.28125, "eval_nll_loss": 0.283203125, "epoch": 2.1052631578947367, "step": 80}, {"loss": 0.4051177978515625, "grad_norm": 0.3778186960664974, "learning_rate": 1.6760206719303105e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082, "rewards/chosen": 12.675000190734863, "rewards/rejected": -0.42578125, "rewards/accuracies": 1.0, "rewards/margins": 13.087499618530273, "logps/chosen": -384.3999938964844, "logps/rejected": -660.0, "logits/chosen": -1.5859375, "logits/rejected": -1.592187523841858, "nll_loss": 0.40507811307907104, "epoch": 2.236842105263158, "step": 85}, {"loss": 0.43151397705078126, "grad_norm": 0.18604144042181714, "learning_rate": 1.1697777844051105e-05, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082606, 
"rewards/chosen": 14.0, "rewards/rejected": -1.6233398914337158, "rewards/accuracies": 1.0, "rewards/margins": 15.612500190734863, "logps/chosen": -488.79998779296875, "logps/rejected": -490.0, "logits/chosen": -1.5515625476837158, "logits/rejected": -1.603124976158142, "nll_loss": 0.43085938692092896, "epoch": 2.3684210526315788, "step": 90}, {"loss": 0.4540283203125, "grad_norm": 0.14745380356896057, "learning_rate": 7.444166378150013e-06, "memory(GiB)": 57.72, "train_speed(iter/s)": 0.082454, "rewards/chosen": 13.8125, "rewards/rejected": -1.818750023841858, "rewards/accuracies": 1.0, "rewards/margins": 15.625, "logps/chosen": -521.2000122070312, "logps/rejected": -601.2000122070312, "logits/chosen": -1.640625, "logits/rejected": -1.6218750476837158, "nll_loss": 0.45390623807907104, "epoch": 2.5, "step": 95}, {"loss": 0.3977935791015625, "grad_norm": 0.27812231190579917, "learning_rate": 4.089194655986306e-06, "memory(GiB)": 66.97, "train_speed(iter/s)": 0.082789, "rewards/chosen": 12.837499618530273, "rewards/rejected": -2.390625, "rewards/accuracies": 1.0, "rewards/margins": 15.225000381469727, "logps/chosen": -400.0, "logps/rejected": -471.20001220703125, "logits/chosen": -1.626562476158142, "logits/rejected": -1.548437476158142, "nll_loss": 0.3980468809604645, "epoch": 2.6315789473684212, "step": 100}, {"eval_loss": 0.281494140625, "eval_runtime": 4.383, "eval_samples_per_second": 0.913, "eval_steps_per_second": 0.228, "eval_rewards/chosen": 11.0625, "eval_rewards/rejected": 0.90234375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.1875, "eval_logps/chosen": -129.0, "eval_logps/rejected": -1064.0, "eval_logits/chosen": -1.609375, "eval_logits/rejected": -1.2734375, "eval_nll_loss": 0.28125, "epoch": 2.6315789473684212, "step": 100}, {"loss": 0.4334808349609375, "grad_norm": 0.47078492381027764, "learning_rate": 1.70370868554659e-06, "memory(GiB)": 66.97, "train_speed(iter/s)": 0.082503, "rewards/chosen": 12.975000381469727, "rewards/rejected": 
-1.778906226158142, "rewards/accuracies": 1.0, "rewards/margins": 14.737500190734863, "logps/chosen": -436.20001220703125, "logps/rejected": -444.20001220703125, "logits/chosen": -1.6593749523162842, "logits/rejected": -1.6484375, "nll_loss": 0.4332031309604645, "epoch": 2.763157894736842, "step": 105}, {"loss": 0.385760498046875, "grad_norm": 0.27296904355561863, "learning_rate": 3.380821129028489e-07, "memory(GiB)": 66.97, "train_speed(iter/s)": 0.082596, "rewards/chosen": 13.912500381469727, "rewards/rejected": -1.814062476158142, "rewards/accuracies": 1.0, "rewards/margins": 15.737500190734863, "logps/chosen": -451.6000061035156, "logps/rejected": -476.0, "logits/chosen": -1.6171875, "logits/rejected": -1.634374976158142, "nll_loss": 0.38554686307907104, "epoch": 2.8947368421052633, "step": 110}, {"eval_loss": 0.28076171875, "eval_runtime": 4.3424, "eval_samples_per_second": 0.921, "eval_steps_per_second": 0.23, "eval_rewards/chosen": 11.0625, "eval_rewards/rejected": 0.65234375, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 10.4375, "eval_logps/chosen": -129.0, "eval_logps/rejected": -1064.0, "eval_logits/chosen": -1.609375, "eval_logits/rejected": -1.2734375, "eval_nll_loss": 0.28125, "epoch": 3.0, "step": 114}, {"train_runtime": 1388.2018, "train_samples_per_second": 0.644, "train_steps_per_second": 0.082, "total_flos": 478691537715200.0, "train_loss": 0.6084153694018983, "epoch": 3.0, "step": 114}], "memory": 66.970703125} diff --git a/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs/events.out.tfevents.1739305649.kml-task-540432-record-10109969-prod-worker-0.24199.0 b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs/events.out.tfevents.1739305649.kml-task-540432-record-10109969-prod-worker-0.24199.0 new file mode 100644 index 0000000000000000000000000000000000000000..80e2a52a44ede857a4e77196619ac78d17e2514e --- /dev/null +++ 
b/deepseek-r1-32b_400_0.5_dpo_4096_rank8_epoch3_what/v1-20250211-202546/runs/events.out.tfevents.1739305649.kml-task-540432-record-10109969-prod-worker-0.24199.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b841f7f82fe91e7579483187f92fc20254bc5edbb64231884cbc8e86306e616 +size 32191 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/args.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/args.json new file mode 100644 index 0000000000000000000000000000000000000000..20902d3b3ef7088a13cabc36bdaf5bde9a4ef216 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 
0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + 
"use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + 
"length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": 
false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + 
"cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', 
model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], 
dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, 
use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/README.md b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4e3f02b35c0f2feed1d84197d9fc0806a438243 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/adapter_config.json 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..40645602d2512f13ca6ac33faf22744ab0483e37 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f7b4fd0fbba07aeed131f0110fb045ecf33baa7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3352fccaa2e07332fdee818c6396786d5ca8676b583b9b5b31c96401fe7e4c54 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/additional_config.json 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/args.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/args.json new file mode 100644 index 0000000000000000000000000000000000000000..20902d3b3ef7088a13cabc36bdaf5bde9a4ef216 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": 
null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, 
+ "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + 
"wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + 
"lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + 
"reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', 
model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', 
half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], 
ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, 
train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8276447fc903c0bb1c687bd3be4bed327004740d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9caeddb13c9cb9e2f37fee44e2cbd17192fa3b7595ddf82ebd9b0a43c102b4e7 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e81090748fd21618f9c2284a525c7b0614690939 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:068d50ef5478ed573f32091a050de309f745f41072a36a3040d119271649ea2f +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..0a102fda63cf5dc0e796a56ae80ca3b94e5f5df6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea24214805cd4d0982eee888856ae3c8677a7e588a21e43d96cc6d12f074b5dd +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01001221b43ecf4bca27e9a073f06694c7d67575 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adadb9275afad31afa1e00b8cbf8353406616609d35b62a334fe24bc1e6b2eb8 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86d49488ffec9d4a445785b854681dcb8ed7610b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4b4c01e80339a7be1e4b504e2f31468d5df77af06d59821a10d5bb29a4d683 +size 30281648 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee5ae80f7fd50b46b8549217b9712905f5e1b101 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d98c53e4b31231e9a9250cc718459a7f9cb7c08c7a7b7e11b3fa55bf0d11c92 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f0a7809a0faca3f2efa700e42833ee6f094dd0f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e62dfbf96d314c74abe85a885c72d8df7d4790185d5e3f56b8bcc61fbd7d32e +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4485791b2ba66410523ae02a3b7f9e92214e73e7 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06c801727baa5e6a45f616a5ed123d75c381e8495e3d4ea85bfe62e1414674b8 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51a824dd36aeeb461bcf24fdf65384617685ab58 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63fc52c4a93583d9ad1a3836a7962f19034a9214ea700213400e4e221d387ebf +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3030ad106e287244bdff00e3a480f834957bf98a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2805eac185e17dd4b95e30fc04f67d72aa81a6d73880fe5e9379cef1e5eb6849 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c64c71e0dbfaaf1d7d1180fb635af37c4883ce5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d7025a7b98bdc6b48906710b0f90ff63b74c7dcffd11ccf5d3339ab5055d27a +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f307c60d54d063683f5211762ab54a9926dee0ad --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9021c4d4ef3a89baee0711f5c7a98e7594780638178c4911da6b76c1b3d6ad58 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f17ea040e9ab6a061988521ac29614f857dbab5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f593d1f818e92016a3064904de1ca8d3c95a428a1c461d70d2b3707cc0bc08cd +size 388374 
diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b5f8c06fd177293874596e9c2ac0303e0a197ce --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e1bbf993d78c31b2e89172cc59291164bae77a5a9420476e18675c7cedc0ec6 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f56a955eeaeebf1b6c1cb4ab4150de631f41dc19 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c72726d5c73af4fdcb1da59a45a3e232a4469863c1f548f8634359802bf5015 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97a4a5ca4846ec856e77100a09835cc006430632 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/global_step100/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f503458dbdd0f650b050f90804c781b0da4033f3af162b716d3b25fa8c0dac7f +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/latest b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/latest new file mode 100644 index 0000000000000000000000000000000000000000..744ae7dbad571b6f37ec6c7066549494261bb59e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/latest @@ -0,0 +1 @@ +global_step100 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_0.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a74f25da28f01a2e6b66587824ee5f5cc9be737 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ee195ebde9bf012f945f068f133e7fe22fef5450c496607e3ef11cc2034a186 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f44ddc47315653477728c971b4ea191a3df8b92c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0fe1a3315d60b197207c5cb249d0ce4f9ce6d7585e696276d9ffbcb5379893 +size 15984 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..04636b9eca6484a4339eaa1e3acdf15d42d493b3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c5bd6eae04542162b3e94245555bd81312524066bc01d0ebbfc4fd8554240e +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..05435e407541728c3159054a4beb6705039a8ddf --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b74942c68b00d657cfce186b0eeb4aa8f52efa04b114803b605fee8de45972 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..94fdf5f2c3e5df27424e6482bf52255531147a23 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd66dd2ba958fc9929441817d8154abbd929c0aa9cd66ff3171965bdaaf5d78 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_5.pth new file mode 
100644 index 0000000000000000000000000000000000000000..da6e37fc011d97a1512e1e746bdd410a738c018a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89eeedefdd62514d0130acc330a5c08e9774c95d38c60997905cfd65fc54b710 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..751fd85c617e15dee9713bc0f0c533af5bd18c8e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ced939100082608f57561a10e1888e69210c80675068db530c5815889910e +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4aacf54fa8285b7e199a7cd62f1ee3d8b9beb5e5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d8d6ee244d99525e7004ae3f02d44ae63082d81fbbab7306f641ac6aeeb736f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2a1fb08c48e9d34df783eb19e7c9d1caf0ed386 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/scheduler.pt @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ec37c3a15b8d061312402391f2fddb52d623a1416d6d2879a30f184450d844f +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ed9d63d2fd7cece95c548286050b242e44ba7a57 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/trainer_state.json @@ -0,0 +1,496 @@ +{ + "best_metric": 0.40234375, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100", + "epoch": 2.6315789473684212, + "eval_steps": 20, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.782230723528885, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.8203125, + "logits/rejected": 0.0322265625, + "logps/chosen": -440.0, + "logps/rejected": -201.0, + "loss": 1.12451171875, + "memory(GiB)": 7.01, + "nll_loss": 0.43359375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.124832 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 11.149383195479057, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.10595703125, + "logits/rejected": -0.0537109375, + "logps/chosen": -374.5, + "logps/rejected": -310.25, + "loss": 1.8389892578125, + "memory(GiB)": 16.76, + "nll_loss": 1.171875, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0422515869140625, + "rewards/margins": 0.0543365478515625, + "rewards/rejected": -0.012115478515625, + "step": 5, + "train_speed(iter/s)": 0.24407 + }, + { + "epoch": 
0.2631578947368421, + "grad_norm": 10.151375928472369, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.11972656100988388, + "logits/rejected": 0.15791015326976776, + "logps/chosen": -454.79998779296875, + "logps/rejected": -430.20001220703125, + "loss": 1.98125, + "memory(GiB)": 40.2, + "nll_loss": 1.4812500476837158, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.337499976158142, + "rewards/margins": 0.608593761920929, + "rewards/rejected": 0.7310546636581421, + "step": 10, + "train_speed(iter/s)": 0.260504 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.153194634947897, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -0.266357421875, + "logits/rejected": 0.06972656399011612, + "logps/chosen": -330.0, + "logps/rejected": -321.0, + "loss": 0.8549072265625, + "memory(GiB)": 40.2, + "nll_loss": 0.631640613079071, + "rewards/accuracies": 0.875, + "rewards/chosen": 5.53125, + "rewards/margins": 2.924999952316284, + "rewards/rejected": 2.6078124046325684, + "step": 15, + "train_speed(iter/s)": 0.28201 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 4.3476489647148195, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -0.601757824420929, + "logits/rejected": 0.5078125, + "logps/chosen": -237.3000030517578, + "logps/rejected": -317.0, + "loss": 0.882763671875, + "memory(GiB)": 40.2, + "nll_loss": 0.7523437738418579, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.8125, + "rewards/margins": 3.628124952316284, + "rewards/rejected": 5.199999809265137, + "step": 20, + "train_speed(iter/s)": 0.287161 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -2.203125, + "eval_logits/rejected": 1.0546875, + "eval_logps/chosen": -282.0, + "eval_logps/rejected": -440.0, + "eval_loss": 0.53515625, + "eval_nll_loss": 0.5234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.3125, + "eval_rewards/margins": 4.8125, + "eval_rewards/rejected": 5.53125, + "eval_runtime": 
1.3114, + "eval_samples_per_second": 3.05, + "eval_steps_per_second": 0.763, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 5.9296423681187935, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -0.19545897841453552, + "logits/rejected": 0.32050782442092896, + "logps/chosen": -358.79998779296875, + "logps/rejected": -321.79998779296875, + "loss": 0.598291015625, + "memory(GiB)": 40.2, + "nll_loss": 0.5562499761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 4.737500190734863, + "rewards/rejected": 5.631249904632568, + "step": 25, + "train_speed(iter/s)": 0.282956 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8447788180902983, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -0.353515625, + "logits/rejected": 0.797656238079071, + "logps/chosen": -244.3000030517578, + "logps/rejected": -307.79998779296875, + "loss": 0.4865478515625, + "memory(GiB)": 40.2, + "nll_loss": 0.46484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.324999809265137, + "rewards/margins": 6.90625, + "rewards/rejected": 4.412499904632568, + "step": 30, + "train_speed(iter/s)": 0.284899 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.0119251792419137, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -0.39433592557907104, + "logits/rejected": 0.728515625, + "logps/chosen": -372.79998779296875, + "logps/rejected": -333.0, + "loss": 0.4839599609375, + "memory(GiB)": 40.2, + "nll_loss": 0.4814453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.350000381469727, + "rewards/margins": 9.037500381469727, + "rewards/rejected": 3.3062500953674316, + "step": 35, + "train_speed(iter/s)": 0.286585 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.0093735109752326, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -0.12558594346046448, + "logits/rejected": 0.776171863079071, + "logps/chosen": -264.79998779296875, + "logps/rejected": -355.20001220703125, + "loss": 
0.44349365234375, + "memory(GiB)": 40.2, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.287500381469727, + "rewards/margins": 10.618749618530273, + "rewards/rejected": 1.6574218273162842, + "step": 40, + "train_speed(iter/s)": 0.288367 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -1.453125, + "eval_logits/rejected": 1.6875, + "eval_logps/chosen": -251.0, + "eval_logps/rejected": -496.0, + "eval_loss": 0.4453125, + "eval_nll_loss": 0.4453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.3125, + "eval_rewards/margins": 13.375, + "eval_rewards/rejected": -0.07421875, + "eval_runtime": 1.3301, + "eval_samples_per_second": 3.007, + "eval_steps_per_second": 0.752, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.7072609823649598, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -0.19873046875, + "logits/rejected": 1.137304663658142, + "logps/chosen": -304.20001220703125, + "logps/rejected": -368.3999938964844, + "loss": 0.44085693359375, + "memory(GiB)": 40.2, + "nll_loss": 0.4410156309604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.087499618530273, + "rewards/margins": 13.5625, + "rewards/rejected": 0.51171875, + "step": 45, + "train_speed(iter/s)": 0.284733 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.5247521627933828, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": 0.0985107421875, + "logits/rejected": 1.3796875476837158, + "logps/chosen": -297.3999938964844, + "logps/rejected": -401.6000061035156, + "loss": 0.426397705078125, + "memory(GiB)": 40.2, + "nll_loss": 0.4263671934604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.949999809265137, + "rewards/margins": 13.774999618530273, + "rewards/rejected": 1.167578101158142, + "step": 50, + "train_speed(iter/s)": 0.286939 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.27152778325772464, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -0.18417969346046448, 
+ "logits/rejected": 1.4109375476837158, + "logps/chosen": -258.0, + "logps/rejected": -356.3999938964844, + "loss": 0.4094970703125, + "memory(GiB)": 40.2, + "nll_loss": 0.40937501192092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.487500190734863, + "rewards/margins": 13.762499809265137, + "rewards/rejected": 0.719531238079071, + "step": 55, + "train_speed(iter/s)": 0.29118 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.23345574148400033, + "learning_rate": 5e-05, + "logits/chosen": -0.095703125, + "logits/rejected": 1.083593726158142, + "logps/chosen": -260.79998779296875, + "logps/rejected": -298.6000061035156, + "loss": 0.27684326171875, + "memory(GiB)": 40.2, + "nll_loss": 0.2767578065395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.274999618530273, + "rewards/margins": 13.574999809265137, + "rewards/rejected": 1.7109375, + "step": 60, + "train_speed(iter/s)": 0.293391 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -1.15625, + "eval_logits/rejected": 1.859375, + "eval_logps/chosen": -239.0, + "eval_logps/rejected": -494.0, + "eval_loss": 0.421142578125, + "eval_nll_loss": 0.421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.625, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": 0.10107421875, + "eval_runtime": 1.3288, + "eval_samples_per_second": 3.01, + "eval_steps_per_second": 0.753, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.6365766166484632, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -0.505664050579071, + "logits/rejected": 1.177148461341858, + "logps/chosen": -259.79998779296875, + "logps/rejected": -357.20001220703125, + "loss": 0.3337158203125, + "memory(GiB)": 40.2, + "nll_loss": 0.33378905057907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.637499809265137, + "rewards/margins": 14.550000190734863, + "rewards/rejected": 1.1103515625, + "step": 65, + "train_speed(iter/s)": 0.292443 + }, + { + "epoch": 1.8421052631578947, + 
"grad_norm": 0.5652403639285394, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": -0.01230468787252903, + "logits/rejected": 1.0339844226837158, + "logps/chosen": -191.60000610351562, + "logps/rejected": -327.3999938964844, + "loss": 0.3323974609375, + "memory(GiB)": 40.2, + "nll_loss": 0.3324218690395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.324999809265137, + "rewards/rejected": 1.1160156726837158, + "step": 70, + "train_speed(iter/s)": 0.292808 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.9034319408629943, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": -0.39921873807907104, + "logits/rejected": 1.267187476158142, + "logps/chosen": -294.20001220703125, + "logps/rejected": -371.20001220703125, + "loss": 0.39881591796875, + "memory(GiB)": 40.2, + "nll_loss": 0.3984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.975000381469727, + "rewards/margins": 15.925000190734863, + "rewards/rejected": 0.05000000074505806, + "step": 75, + "train_speed(iter/s)": 0.293937 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.6395555179201285, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": -0.22714844346046448, + "logits/rejected": 1.212499976158142, + "logps/chosen": -236.60000610351562, + "logps/rejected": -388.0, + "loss": 0.356890869140625, + "memory(GiB)": 40.2, + "nll_loss": 0.3724609315395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.337499618530273, + "rewards/margins": 15.037500381469727, + "rewards/rejected": 0.32695311307907104, + "step": 80, + "train_speed(iter/s)": 0.292172 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -1.1484375, + "eval_logits/rejected": 1.8125, + "eval_logps/chosen": -233.0, + "eval_logps/rejected": -490.0, + "eval_loss": 0.404296875, + "eval_nll_loss": 0.404296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 15.1875, + "eval_rewards/margins": 14.5625, + "eval_rewards/rejected": 0.625, + 
"eval_runtime": 1.3795, + "eval_samples_per_second": 2.9, + "eval_steps_per_second": 0.725, + "step": 80 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.8355260792737639, + "learning_rate": 1.6760206719303105e-05, + "logits/chosen": -0.3270507752895355, + "logits/rejected": 1.427343726158142, + "logps/chosen": -249.60000610351562, + "logps/rejected": -412.3999938964844, + "loss": 0.344207763671875, + "memory(GiB)": 40.2, + "nll_loss": 0.34394532442092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0625, + "rewards/margins": 14.850000381469727, + "rewards/rejected": 1.1875, + "step": 85, + "train_speed(iter/s)": 0.291482 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.30799153697843773, + "learning_rate": 1.1697777844051105e-05, + "logits/chosen": -0.4400390684604645, + "logits/rejected": 0.818359375, + "logps/chosen": -251.8000030517578, + "logps/rejected": -320.0, + "loss": 0.320977783203125, + "memory(GiB)": 40.2, + "nll_loss": 0.32109373807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.200000762939453, + "rewards/margins": 14.762499809265137, + "rewards/rejected": 1.398828148841858, + "step": 90, + "train_speed(iter/s)": 0.293824 + }, + { + "epoch": 2.5, + "grad_norm": 0.2539868135825997, + "learning_rate": 7.444166378150013e-06, + "logits/chosen": -0.24687500298023224, + "logits/rejected": 1.1484375, + "logps/chosen": -312.20001220703125, + "logps/rejected": -399.6000061035156, + "loss": 0.41552734375, + "memory(GiB)": 40.2, + "nll_loss": 0.41523438692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.299999237060547, + "rewards/margins": 16.950000762939453, + "rewards/rejected": 0.38164061307907104, + "step": 95, + "train_speed(iter/s)": 0.293791 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.4538384538590968, + "learning_rate": 4.089194655986306e-06, + "logits/chosen": -0.552197277545929, + "logits/rejected": 1.428125023841858, + "logps/chosen": -243.39999389648438, + "logps/rejected": 
-329.6000061035156, + "loss": 0.34947509765625, + "memory(GiB)": 49.45, + "nll_loss": 0.3490234315395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.524999618530273, + "rewards/margins": 15.837499618530273, + "rewards/rejected": 0.691601574420929, + "step": 100, + "train_speed(iter/s)": 0.295106 + }, + { + "epoch": 2.6315789473684212, + "eval_logits/chosen": -1.109375, + "eval_logits/rejected": 1.8125, + "eval_logps/chosen": -231.0, + "eval_logps/rejected": -492.0, + "eval_loss": 0.40234375, + "eval_nll_loss": 0.40234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 15.375, + "eval_rewards/margins": 15.125, + "eval_rewards/rejected": 0.30078125, + "eval_runtime": 1.3229, + "eval_samples_per_second": 3.024, + "eval_steps_per_second": 0.756, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 127239277510656.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..29938827121efe95d48f066a4d025bf5a409b6bd --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa79d06bb4e33d66ebd6b0d7dd842a9764bbfb50d6568115915200fbf4b64794 +size 9016 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/zero_to_fp32.py 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-100/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/README.md b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4e3f02b35c0f2feed1d84197d9fc0806a438243 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..40645602d2512f13ca6ac33faf22744ab0483e37 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": 
{}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dc990067476423738ac742afb26fdc699e733cc1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb3d11bdadf09689f3689d6ba6fd5df6c79fb6d7d42ebfaca4317d89566dfec7 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/args.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/args.json new file mode 100644 index 0000000000000000000000000000000000000000..20902d3b3ef7088a13cabc36bdaf5bde9a4ef216 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", 
+ "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, 
+ "per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": 
null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + 
"auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + 
"galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + 
"model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, 
per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': 
{'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, 
batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98841abc2cb5435dd74e5b6623bc696d08128f6c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04abf725d12e0f8e8918e0cfa8e803d32e149511dda3a8ba68ac8b1b99995a82 +size 30281648 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68684f5f051b7ad9de1b4baa758668df6250ad90 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de01f360ef39e89cf5200b58dc80fe036299e2eae6f9cffa81f1b66db9fcb8a9 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f543a9f9704229a6573059541f41c4591b430e4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef340b9922197b67767cbd449b7436dc216a45bdae707a390876c523370f7d84 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2284969b9aaeba94e7dec6699c0d77b50f2f859d --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69271e486e9daf678bc7de0c43b0092aa21d01ee2fe6781bec33dd2b5660fd07 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fd740764a8ba7ce7c5477b61a876cbfea94a8c9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11d62653cc95f81ce1e330b587c37e402ba7d8a7295deaec6c6420930dcdd51d +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3b9814afef93adbee4fb3e16571b940d589e96e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:086c8102d5dfd0c014f269e512014d0703149a9d0b220c2210b650a776799990 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac262249de08ad66a5a71d67071d8e73d3d984b4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bc025a3b7435c56bf5cda7c1cd5a692b573a2ecd05901b82947a3375245522d +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..344513c764a9e81fee8f13f51c91cadcefd8298c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6d9aece3323245357ffe6399e3e51fef90ada66c42bb5095187de8f9034ba41 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8105dae45351bc9cd8b211c09f37bb5810d95eec --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:21b7abcceb9145df5a06a246581437a97934a2f8604dbf3c79a9a24c8810082f +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fe29e9f02f4c93f6a4a80ed47ee493663ea2869 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:047538f1cc9f7c5ced220fbb45f0ca9f60490275935b59b4ffa604e31036970b +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73562b6988c82a4a8c844c641c8f3d6f330e77ed --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6ca18f7097ff28a90d8330b59dff6517f5dd7fcf4d416ce0ca9e03963aeff92 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..753f6a2f1281abee13b649b05f5562881ad567c0 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c45b61d8ca6f2fb536ccc41020c68cdc99525321ad7e8f0559c0cd39e64ec647 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c5678c679c42abe517d73591596fdc55c4ae36c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1544cf6f6b337811d0d66e2ab7c960f567172e6768154013d3ad5bf104780e7 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..645d841d3b4fc8aec42bb33a8e158e33118a9e86 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50d991fd917a72eecd18f87f0933d1de6f0f5d4868c46317960defa5318a7ac1 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc20e3e0e7f464801f035b049d529536f9456b86 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1788ae209de6f083ea5e6d315e60f7ca77e5e12c3bfb1af3c47049d284bc1fe3 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16318a8152fc653ccbb55a440f9c7789db0458bf --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/global_step114/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338e769473a5bd5e6a6e71459d5ff3a55f43595da5cc253baed8259c1dcd7cb9 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/latest b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/latest new file mode 100644 index 0000000000000000000000000000000000000000..aad80f76777fd4d23b0b81026f4601524335cbe1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/latest @@ -0,0 +1 @@ +global_step114 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_0.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..ee83ae5e323e0bb676daf05f7f41b7951b49c7af --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9162e03c562553a5d9d13120f544d3c47ea71bb39aa44e18253675e17ed4a4 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cd0edf74beb406ae74d27fac689e74cc1a7d12b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4809456871b3a40c8db7e0926a9db11b01149a1d483fb29b16fc69dabaf36c6f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..378e4e23e02084387cef58f5bfa08ef5b23ef1b3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb6bcf25ff148b74eea7dd4895fc42e9433538fff5d75f0d2ae6cb0c2fdadf0 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c9d23b00a6e62ab23a83b688e4077471f0501ba0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_3.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f00ea04cd1a52c539d9cc948ac8a04676d6b99702acd09149565f781806f63f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_4.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf6105fec105f5636599de6b5ea414adc300ed30 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5571fb2fc1b413792b01ac691c759786855573992bab1d14875faccdaf8c881e +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..983c7580e17a958602e3218e885e88e85d4ed9a0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59019ba23ead9c15851cb4349397254458ce50ea3c2987090404f4f3842c6d8f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f87fedb0a1eac5d251eeb1e7cf58190877f6b60 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45fdffda57fda4a555da7a5de6fc6ec7324e0dae048b92519af6c4f6a1bc7412 +size 15984 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d32d0d7a4ca68837a8e91f7101758f2f48116bde --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fb2c13e63aba83c4505fae1639f79a33853d8f1bebe20cecb73bf53c8e7c46 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a86ac614a477eb67963adb2c8c07f37c79ded059 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7a9fd18bda7faa50931342147a7de5605bed0f91f6c70d821e84b7bf8f444f +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..429f571ce1540e386f423e5e35b61e312df928e7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/trainer_state.json @@ -0,0 +1,549 @@ +{ + "best_metric": 0.40185547, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114", + "epoch": 3.0, + "eval_steps": 20, + "global_step": 114, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + 
"log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.782230723528885, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.8203125, + "logits/rejected": 0.0322265625, + "logps/chosen": -440.0, + "logps/rejected": -201.0, + "loss": 1.12451171875, + "memory(GiB)": 7.01, + "nll_loss": 0.43359375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.124832 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 11.149383195479057, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.10595703125, + "logits/rejected": -0.0537109375, + "logps/chosen": -374.5, + "logps/rejected": -310.25, + "loss": 1.8389892578125, + "memory(GiB)": 16.76, + "nll_loss": 1.171875, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0422515869140625, + "rewards/margins": 0.0543365478515625, + "rewards/rejected": -0.012115478515625, + "step": 5, + "train_speed(iter/s)": 0.24407 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.151375928472369, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.11972656100988388, + "logits/rejected": 0.15791015326976776, + "logps/chosen": -454.79998779296875, + "logps/rejected": -430.20001220703125, + "loss": 1.98125, + "memory(GiB)": 40.2, + "nll_loss": 1.4812500476837158, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.337499976158142, + "rewards/margins": 0.608593761920929, + "rewards/rejected": 0.7310546636581421, + "step": 10, + "train_speed(iter/s)": 0.260504 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.153194634947897, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -0.266357421875, + "logits/rejected": 0.06972656399011612, + "logps/chosen": -330.0, + "logps/rejected": -321.0, + "loss": 0.8549072265625, + "memory(GiB)": 40.2, + "nll_loss": 0.631640613079071, + "rewards/accuracies": 0.875, + "rewards/chosen": 5.53125, + "rewards/margins": 2.924999952316284, + 
"rewards/rejected": 2.6078124046325684, + "step": 15, + "train_speed(iter/s)": 0.28201 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 4.3476489647148195, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -0.601757824420929, + "logits/rejected": 0.5078125, + "logps/chosen": -237.3000030517578, + "logps/rejected": -317.0, + "loss": 0.882763671875, + "memory(GiB)": 40.2, + "nll_loss": 0.7523437738418579, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.8125, + "rewards/margins": 3.628124952316284, + "rewards/rejected": 5.199999809265137, + "step": 20, + "train_speed(iter/s)": 0.287161 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -2.203125, + "eval_logits/rejected": 1.0546875, + "eval_logps/chosen": -282.0, + "eval_logps/rejected": -440.0, + "eval_loss": 0.53515625, + "eval_nll_loss": 0.5234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.3125, + "eval_rewards/margins": 4.8125, + "eval_rewards/rejected": 5.53125, + "eval_runtime": 1.3114, + "eval_samples_per_second": 3.05, + "eval_steps_per_second": 0.763, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 5.9296423681187935, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -0.19545897841453552, + "logits/rejected": 0.32050782442092896, + "logps/chosen": -358.79998779296875, + "logps/rejected": -321.79998779296875, + "loss": 0.598291015625, + "memory(GiB)": 40.2, + "nll_loss": 0.5562499761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 4.737500190734863, + "rewards/rejected": 5.631249904632568, + "step": 25, + "train_speed(iter/s)": 0.282956 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8447788180902983, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -0.353515625, + "logits/rejected": 0.797656238079071, + "logps/chosen": -244.3000030517578, + "logps/rejected": -307.79998779296875, + "loss": 0.4865478515625, + "memory(GiB)": 40.2, + "nll_loss": 0.46484375, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 11.324999809265137, + "rewards/margins": 6.90625, + "rewards/rejected": 4.412499904632568, + "step": 30, + "train_speed(iter/s)": 0.284899 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.0119251792419137, + "learning_rate": 8.323979328069689e-05, + "logits/chosen": -0.39433592557907104, + "logits/rejected": 0.728515625, + "logps/chosen": -372.79998779296875, + "logps/rejected": -333.0, + "loss": 0.4839599609375, + "memory(GiB)": 40.2, + "nll_loss": 0.4814453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.350000381469727, + "rewards/margins": 9.037500381469727, + "rewards/rejected": 3.3062500953674316, + "step": 35, + "train_speed(iter/s)": 0.286585 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.0093735109752326, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -0.12558594346046448, + "logits/rejected": 0.776171863079071, + "logps/chosen": -264.79998779296875, + "logps/rejected": -355.20001220703125, + "loss": 0.44349365234375, + "memory(GiB)": 40.2, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.287500381469727, + "rewards/margins": 10.618749618530273, + "rewards/rejected": 1.6574218273162842, + "step": 40, + "train_speed(iter/s)": 0.288367 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -1.453125, + "eval_logits/rejected": 1.6875, + "eval_logps/chosen": -251.0, + "eval_logps/rejected": -496.0, + "eval_loss": 0.4453125, + "eval_nll_loss": 0.4453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.3125, + "eval_rewards/margins": 13.375, + "eval_rewards/rejected": -0.07421875, + "eval_runtime": 1.3301, + "eval_samples_per_second": 3.007, + "eval_steps_per_second": 0.752, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.7072609823649598, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -0.19873046875, + "logits/rejected": 1.137304663658142, + "logps/chosen": -304.20001220703125, + 
"logps/rejected": -368.3999938964844, + "loss": 0.44085693359375, + "memory(GiB)": 40.2, + "nll_loss": 0.4410156309604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.087499618530273, + "rewards/margins": 13.5625, + "rewards/rejected": 0.51171875, + "step": 45, + "train_speed(iter/s)": 0.284733 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.5247521627933828, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": 0.0985107421875, + "logits/rejected": 1.3796875476837158, + "logps/chosen": -297.3999938964844, + "logps/rejected": -401.6000061035156, + "loss": 0.426397705078125, + "memory(GiB)": 40.2, + "nll_loss": 0.4263671934604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.949999809265137, + "rewards/margins": 13.774999618530273, + "rewards/rejected": 1.167578101158142, + "step": 50, + "train_speed(iter/s)": 0.286939 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.27152778325772464, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -0.18417969346046448, + "logits/rejected": 1.4109375476837158, + "logps/chosen": -258.0, + "logps/rejected": -356.3999938964844, + "loss": 0.4094970703125, + "memory(GiB)": 40.2, + "nll_loss": 0.40937501192092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.487500190734863, + "rewards/margins": 13.762499809265137, + "rewards/rejected": 0.719531238079071, + "step": 55, + "train_speed(iter/s)": 0.29118 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.23345574148400033, + "learning_rate": 5e-05, + "logits/chosen": -0.095703125, + "logits/rejected": 1.083593726158142, + "logps/chosen": -260.79998779296875, + "logps/rejected": -298.6000061035156, + "loss": 0.27684326171875, + "memory(GiB)": 40.2, + "nll_loss": 0.2767578065395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.274999618530273, + "rewards/margins": 13.574999809265137, + "rewards/rejected": 1.7109375, + "step": 60, + "train_speed(iter/s)": 0.293391 + }, + { + "epoch": 1.5789473684210527, + 
"eval_logits/chosen": -1.15625, + "eval_logits/rejected": 1.859375, + "eval_logps/chosen": -239.0, + "eval_logps/rejected": -494.0, + "eval_loss": 0.421142578125, + "eval_nll_loss": 0.421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.625, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": 0.10107421875, + "eval_runtime": 1.3288, + "eval_samples_per_second": 3.01, + "eval_steps_per_second": 0.753, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.6365766166484632, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -0.505664050579071, + "logits/rejected": 1.177148461341858, + "logps/chosen": -259.79998779296875, + "logps/rejected": -357.20001220703125, + "loss": 0.3337158203125, + "memory(GiB)": 40.2, + "nll_loss": 0.33378905057907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.637499809265137, + "rewards/margins": 14.550000190734863, + "rewards/rejected": 1.1103515625, + "step": 65, + "train_speed(iter/s)": 0.292443 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.5652403639285394, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": -0.01230468787252903, + "logits/rejected": 1.0339844226837158, + "logps/chosen": -191.60000610351562, + "logps/rejected": -327.3999938964844, + "loss": 0.3323974609375, + "memory(GiB)": 40.2, + "nll_loss": 0.3324218690395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.324999809265137, + "rewards/rejected": 1.1160156726837158, + "step": 70, + "train_speed(iter/s)": 0.292808 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.9034319408629943, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": -0.39921873807907104, + "logits/rejected": 1.267187476158142, + "logps/chosen": -294.20001220703125, + "logps/rejected": -371.20001220703125, + "loss": 0.39881591796875, + "memory(GiB)": 40.2, + "nll_loss": 0.3984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.975000381469727, + "rewards/margins": 
15.925000190734863, + "rewards/rejected": 0.05000000074505806, + "step": 75, + "train_speed(iter/s)": 0.293937 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.6395555179201285, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": -0.22714844346046448, + "logits/rejected": 1.212499976158142, + "logps/chosen": -236.60000610351562, + "logps/rejected": -388.0, + "loss": 0.356890869140625, + "memory(GiB)": 40.2, + "nll_loss": 0.3724609315395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.337499618530273, + "rewards/margins": 15.037500381469727, + "rewards/rejected": 0.32695311307907104, + "step": 80, + "train_speed(iter/s)": 0.292172 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -1.1484375, + "eval_logits/rejected": 1.8125, + "eval_logps/chosen": -233.0, + "eval_logps/rejected": -490.0, + "eval_loss": 0.404296875, + "eval_nll_loss": 0.404296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 15.1875, + "eval_rewards/margins": 14.5625, + "eval_rewards/rejected": 0.625, + "eval_runtime": 1.3795, + "eval_samples_per_second": 2.9, + "eval_steps_per_second": 0.725, + "step": 80 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.8355260792737639, + "learning_rate": 1.6760206719303105e-05, + "logits/chosen": -0.3270507752895355, + "logits/rejected": 1.427343726158142, + "logps/chosen": -249.60000610351562, + "logps/rejected": -412.3999938964844, + "loss": 0.344207763671875, + "memory(GiB)": 40.2, + "nll_loss": 0.34394532442092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.0625, + "rewards/margins": 14.850000381469727, + "rewards/rejected": 1.1875, + "step": 85, + "train_speed(iter/s)": 0.291482 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.30799153697843773, + "learning_rate": 1.1697777844051105e-05, + "logits/chosen": -0.4400390684604645, + "logits/rejected": 0.818359375, + "logps/chosen": -251.8000030517578, + "logps/rejected": -320.0, + "loss": 0.320977783203125, + "memory(GiB)": 40.2, + 
"nll_loss": 0.32109373807907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.200000762939453, + "rewards/margins": 14.762499809265137, + "rewards/rejected": 1.398828148841858, + "step": 90, + "train_speed(iter/s)": 0.293824 + }, + { + "epoch": 2.5, + "grad_norm": 0.2539868135825997, + "learning_rate": 7.444166378150013e-06, + "logits/chosen": -0.24687500298023224, + "logits/rejected": 1.1484375, + "logps/chosen": -312.20001220703125, + "logps/rejected": -399.6000061035156, + "loss": 0.41552734375, + "memory(GiB)": 40.2, + "nll_loss": 0.41523438692092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 17.299999237060547, + "rewards/margins": 16.950000762939453, + "rewards/rejected": 0.38164061307907104, + "step": 95, + "train_speed(iter/s)": 0.293791 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.4538384538590968, + "learning_rate": 4.089194655986306e-06, + "logits/chosen": -0.552197277545929, + "logits/rejected": 1.428125023841858, + "logps/chosen": -243.39999389648438, + "logps/rejected": -329.6000061035156, + "loss": 0.34947509765625, + "memory(GiB)": 49.45, + "nll_loss": 0.3490234315395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.524999618530273, + "rewards/margins": 15.837499618530273, + "rewards/rejected": 0.691601574420929, + "step": 100, + "train_speed(iter/s)": 0.295106 + }, + { + "epoch": 2.6315789473684212, + "eval_logits/chosen": -1.109375, + "eval_logits/rejected": 1.8125, + "eval_logps/chosen": -231.0, + "eval_logps/rejected": -492.0, + "eval_loss": 0.40234375, + "eval_nll_loss": 0.40234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 15.375, + "eval_rewards/margins": 15.125, + "eval_rewards/rejected": 0.30078125, + "eval_runtime": 1.3229, + "eval_samples_per_second": 3.024, + "eval_steps_per_second": 0.756, + "step": 100 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.793720059226471, + "learning_rate": 1.70370868554659e-06, + "logits/chosen": -0.35712891817092896, + "logits/rejected": 
1.049218773841858, + "logps/chosen": -223.8000030517578, + "logps/rejected": -309.0, + "loss": 0.312152099609375, + "memory(GiB)": 49.45, + "nll_loss": 0.3121093809604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.675000190734863, + "rewards/rejected": 1.4679687023162842, + "step": 105, + "train_speed(iter/s)": 0.293988 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.44121695697744323, + "learning_rate": 3.380821129028489e-07, + "logits/chosen": -0.4349609315395355, + "logits/rejected": 1.498437523841858, + "logps/chosen": -209.1999969482422, + "logps/rejected": -315.0, + "loss": 0.30657806396484377, + "memory(GiB)": 49.45, + "nll_loss": 0.306640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 16.125, + "rewards/margins": 14.487500190734863, + "rewards/rejected": 1.6531250476837158, + "step": 110, + "train_speed(iter/s)": 0.294539 + }, + { + "epoch": 3.0, + "eval_logits/chosen": -1.1171875, + "eval_logits/rejected": 1.8125, + "eval_logps/chosen": -231.0, + "eval_logps/rejected": -492.0, + "eval_loss": 0.40185546875, + "eval_nll_loss": 0.40234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 15.375, + "eval_rewards/margins": 15.125, + "eval_rewards/rejected": 0.30078125, + "eval_runtime": 1.2824, + "eval_samples_per_second": 3.119, + "eval_steps_per_second": 0.78, + "step": 114 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 144718705983488.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/training_args.bin 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..29938827121efe95d48f066a4d025bf5a409b6bd --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa79d06bb4e33d66ebd6b0d7dd842a9764bbfb50d6568115915200fbf4b64794 +size 9016 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient. 
+ Convert the pseudo tensor to a torch tensor by calling ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model``: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/README.md b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4e3f02b35c0f2feed1d84197d9fc0806a438243 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..40645602d2512f13ca6ac33faf22744ab0483e37 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, 
+ "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6dd80bdda84a136effff44b116fd61aaa2cd939e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0884008064cf8e2cc7318ac876915da4e8b5dcce5bfc9d49e35ce57644e7e20 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/args.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/args.json new file mode 100644 index 0000000000000000000000000000000000000000..20902d3b3ef7088a13cabc36bdaf5bde9a4ef216 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + 
"model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + 
"per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": 
null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + 
"auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + 
"galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + 
"model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, 
per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': 
{'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, 
batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f53626449da72451b665cb5e386d153cac704288 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1c0f689fe77059b5a2c33944185bbc08962e1f663dbbe7e13230831171289b3 +size 30281648 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a2627686a5de2e863599525ce436f734fe21bd7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfa9d3b8db98a1f5fbd926077e47656efcf9341d5a570fbf3e1dc85e117c141 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..616573776ecde1b987219ea4fc3f55fab708511a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30332adad3e22aeafb2e37bd300995ab200bef63c31f47d3a035bb09ae746615 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c10b8e3192c97576ce266013c974d9d831f4ead --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12f50b48e33ae714d5425fd9d5174c5ef2cc1e2055a48e946759fbacadfe6ddd +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c201cdd76566f0f6fc7d016c1b37f1b6d7950707 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3725e6d105fc2ab440d9d18ffdc6b8f9a759f118a91ec8f1ef08e5b2fb73828 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67a43c4b5d8ac8f5190c3aa073db99c49ff754d2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f22d5809234c1cf3ad66a263c19ee28c59a1ed446126bb28dad8d0820c192dcd +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f1fed8b099f3a950ab5857fc24e0c79867769ea --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04f2567c8efa06d1cac6b6cb878eda73259438c293206b32b5dfbaef36bb57ec +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e458a95594177bb12a1293e9500bdac5c7883b7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30241e065c0955de1b6723e82a69b22b1f07d2d5f104e16ec5eac3b03596f35b +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6904b5a1b81fc9b3c24004507cf00de0d8f1866c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93ff8d61709de771ae14054e704e8a6889ba05ca80bfe73023c59a807111e4bb 
+size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..405d213a9ae32dd2225213f76a357690888b6423 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:616a9d019ce3a80a348e450659e5d858fd578014785283766ca90008d3eda5df +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7793ee191f11b0047153797fc07b373c39b0be58 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e81ff908795ff450171816606ded8213d1607a5e1796ac1d80674a9f7592586 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..547d5d0f3751b58263786e4a61365c68bd768cae --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_3_mp_rank_00_model_states.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f50d2998cb2eb0f2a3b20ddfec684fb71b15c97ddd9f9e84b0a5ae7b1f327b6d +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c11125c9da4584c68c1585e23ca599449c01eb86 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c771b9abf6547621c6351f89b173cadc940d7c3189b275bb16c974bd16d7cee +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..199bfb18a46c2827d070ddc0b71f5e23eb190fe1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83f39ad204f6fe51202ae1dfc58cffe8b6278c6e927b4e76346c40375539198d +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddfc3cba267577506a2499cd9e493909878af3ee --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d8973ba1133525b9236f8c3ae1d2b0f657860f798695600022a69ae75a96bfd +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b85ff459db6ba2045710229b21d451151a3306f9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/global_step20/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73e11602431ae204c14284429098328b678869ac9ed6806afe37a161cc6240a0 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/latest b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63223cdf01f44f9f3129915f9de3d647f31 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/latest @@ -0,0 +1 @@ +global_step20 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_0.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b346349ce12dd5a17d4b91ed2a5722bb52550950 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..68f3c6994456cb8d0592a5375d99503c8924b1c4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..be044f6ceeed587d30e80c2f72d5aa19fdc9947b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc825249656a9b858782542bd3f4386250f1dfe0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_4.pth 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..d30f52a44be563c152ae09db6ae934da6da0d3ed --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8715d27ab23ae545d58039cf949cc44ecc1da5e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ed791b6ef76eadf0b0c55a5733411771e2ae027 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..800c3bbbc5edf7db01a8316069d439c5fb8d8c30 --- /dev/null 
+++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e201402bb36891e48e2b7110304ad87df61a6070 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b40f5e8ba2f299f4eda41d6964ef1f313f53d1f8f687ebd6938ce3242fb4c3 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cd7f2ddd9f7f33c5a6d024df61f80ca792997ba9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/trainer_state.json @@ -0,0 +1,140 @@ +{ + "best_metric": 0.53515625, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20", + "epoch": 0.5263157894736842, + "eval_steps": 20, + "global_step": 20, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.782230723528885, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.8203125, + "logits/rejected": 0.0322265625, + "logps/chosen": -440.0, + "logps/rejected": -201.0, + "loss": 1.12451171875, + "memory(GiB)": 7.01, + "nll_loss": 0.43359375, + 
"rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.124832 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 11.149383195479057, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.10595703125, + "logits/rejected": -0.0537109375, + "logps/chosen": -374.5, + "logps/rejected": -310.25, + "loss": 1.8389892578125, + "memory(GiB)": 16.76, + "nll_loss": 1.171875, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0422515869140625, + "rewards/margins": 0.0543365478515625, + "rewards/rejected": -0.012115478515625, + "step": 5, + "train_speed(iter/s)": 0.24407 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.151375928472369, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.11972656100988388, + "logits/rejected": 0.15791015326976776, + "logps/chosen": -454.79998779296875, + "logps/rejected": -430.20001220703125, + "loss": 1.98125, + "memory(GiB)": 40.2, + "nll_loss": 1.4812500476837158, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.337499976158142, + "rewards/margins": 0.608593761920929, + "rewards/rejected": 0.7310546636581421, + "step": 10, + "train_speed(iter/s)": 0.260504 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.153194634947897, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -0.266357421875, + "logits/rejected": 0.06972656399011612, + "logps/chosen": -330.0, + "logps/rejected": -321.0, + "loss": 0.8549072265625, + "memory(GiB)": 40.2, + "nll_loss": 0.631640613079071, + "rewards/accuracies": 0.875, + "rewards/chosen": 5.53125, + "rewards/margins": 2.924999952316284, + "rewards/rejected": 2.6078124046325684, + "step": 15, + "train_speed(iter/s)": 0.28201 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 4.3476489647148195, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -0.601757824420929, + "logits/rejected": 0.5078125, + "logps/chosen": -237.3000030517578, + 
"logps/rejected": -317.0, + "loss": 0.882763671875, + "memory(GiB)": 40.2, + "nll_loss": 0.7523437738418579, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.8125, + "rewards/margins": 3.628124952316284, + "rewards/rejected": 5.199999809265137, + "step": 20, + "train_speed(iter/s)": 0.287161 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -2.203125, + "eval_logits/rejected": 1.0546875, + "eval_logps/chosen": -282.0, + "eval_logps/rejected": -440.0, + "eval_loss": 0.53515625, + "eval_nll_loss": 0.5234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.3125, + "eval_rewards/margins": 4.8125, + "eval_rewards/rejected": 5.53125, + "eval_runtime": 1.3114, + "eval_samples_per_second": 3.05, + "eval_steps_per_second": 0.763, + "step": 20 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 24456134656000.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..29938827121efe95d48f066a4d025bf5a409b6bd --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa79d06bb4e33d66ebd6b0d7dd842a9764bbfb50d6568115915200fbf4b64794 +size 9016 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/zero_to_fp32.py 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-20/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/README.md b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4e3f02b35c0f2feed1d84197d9fc0806a438243 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..40645602d2512f13ca6ac33faf22744ab0483e37 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, 
+ "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2bdddefc6a3884007e0d3a065280b9704b45d270 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ef855af2f49be29a627b3a28cf71c51745e8af46855d587f92730e2782dcac +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/args.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/args.json new file mode 100644 index 0000000000000000000000000000000000000000..20902d3b3ef7088a13cabc36bdaf5bde9a4ef216 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + 
"model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + 
"per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": 
null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + 
"auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + 
"galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + 
"model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, 
per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': 
{'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, 
batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4ad17360175ba9e6959a51437dade03ea17d739 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae5ff7ec937a3a094262a9cb5b419b4f83d818295182317054c0dfdfdde66ef8 +size 30281648 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bee06d6588e90bbc90dc9a2599526f87a66aba63 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1ea08a80f599eb9d3e8a279189b0eec9a6a01b26f0795879a7e6fedbd02599 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f6de507238b3aa1dcf973e455322932907cf9f3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6fde08d3c31d29567d109b0cf004881afa0aace32226e261d325414457de72b +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..458803a859d7876e6c7af19559cca71471afded1 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815fc72ad1e762fdc9ca63e8c8b896571eafccb679d89750c8318baf59711bbf +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0eef23f6b575399d2629ebbaa08fad04a20f383 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73af17e4bdb040673a43c1663743ad680d873b165c52c0b4cdd76da67e9641f8 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..920238e20b831fb61c30b85beec6655a62c664cf --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2605ad7e2812073fb9755e5c79ce134148de4d970dbc4a9391c0437d20613365 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae3c19596a33c3a5cb76b9410c99006f5090e046 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17540695b45bdf4a9e259243160d97464758f1e11bedf58df14410b3a4c869e9 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cb351addaa11cf2a19ac5c1c5575e33984dd6c5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:634e92e7ac9eb56dc70ea8cd4fc9a83543555199d2d5671fd0b180dc57a95872 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..361b10082fac59dd97ab966a7018f0d7b3b7cd64 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ff1b84a0bd0bbab907145871d3ecb84acbf44bc9facff87544c584d0f5d0c48 
+size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..719fd2aec466dc90e4eed4a171a7b904d7b1e645 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d59d440564ab5cf3a74da88232b2d6392fc6f4204c56b45cade83f37d8b58232 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb2aead2d6f0fa652f6070a4d54ef176d78be7a4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47e4d48fbd6b1d18da1094fca2387abf9ef78bc9a18a60ffcdafda9be087473f +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f221844b4c22c03463188c73cf03ed77bfedb5ec --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_3_mp_rank_00_model_states.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9e02c5784f0785f2a728589c208caed540adbf5d9bdc738e838a1f53a22f4f8 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..465705aad5e4a3372443495b196ec893cb0d5680 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b818d9ad2d69f79be47c10ef4f544cc2be83f6c701314b52cec306c0da7aae10 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..746e6894c65c669b87a9d4ac9b3da779b7200877 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd5f9903b3c41153983cc4fd3c5d4aac8bc9370598c8979979ff6082bb870a1b +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b14ccd795d9fce191a618951fcf404d11f86084 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bcd2b28a6578ccf3230dfadb946cf8d9bd04c4dbf13ccb26b7b9cd12d2dd220 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8725b940d74f93a9297bc9accbea5b1f3b453ffb --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/global_step40/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a275762e49847c884a5185e84648ea8e035448016eb22e4d5934ad613161cbc4 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/latest b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/latest new file mode 100644 index 0000000000000000000000000000000000000000..8631ab8ddebf60eb3e7f5f2c2b1a2da8298a43c3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/latest @@ -0,0 +1 @@ +global_step40 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_0.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e5b7e2ec90fdb824c8932464c1d9068330655a7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:36d2a2034ebb05cb71c510897f2795b31164e50f17b270bc25d2be3ad9a17b22 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d8d7722fc72cab6d492b76cb99c8177dcc47544 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060dfdb1c49102cbdc8868a6031e68787601b4ccd782f3fb9b137e20c1fd2c7a +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c9f84eff30cfa9ea1feedaf262d61fb12e4cba7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af01895cb66e616591f2e4baa8dcd8151530eab133c73571ccb31c74f35422ce +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..6eebfb928f8e91eff0ea1645a20b5aa4465c705b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677921992b1e0cef3aee776f245975003d22f51d9bd6ed20f248ded1deb72fa9 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_4.pth 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..0866030a266c6d003cc378a9418a723f69e8ab99 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d69353c629541c690c5471f8ec05fdab2bfecf3d37afaa436bc45939da6db68f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..554638d77107f832d7aa51c61645ee2d6c48a36d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e40ba6668cc03c9162c68a933d164bf38ae2d196a9a6fec03ae615491201185 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..964331b65172a1bcac03e4673415fa787f724268 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:870968fea834e24b2e099cf3e4fe1e3fb8caf38d8f8e5b790d7d47386d4d05f5 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd4754d65217d0f9d1f2d3334397df7a8a079652 --- /dev/null 
+++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9e19618bee7c6ef43256fea25abe19bca88535eb1e7dc213cde8929ae4e8180 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d2abd2d1feb7e9804d318f0409ab46d47248ca5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc0cfcde03016592eed8191f897341f523bbb99d728821c8afed66eae5a64729 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..72b678a2bd81a55fe4c39492e93a2d17bfd28616 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/trainer_state.json @@ -0,0 +1,229 @@ +{ + "best_metric": 0.4453125, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40", + "epoch": 1.0526315789473684, + "eval_steps": 20, + "global_step": 40, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.782230723528885, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.8203125, + "logits/rejected": 0.0322265625, + "logps/chosen": -440.0, + "logps/rejected": -201.0, + "loss": 1.12451171875, + "memory(GiB)": 7.01, + "nll_loss": 0.43359375, + 
"rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.124832 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 11.149383195479057, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.10595703125, + "logits/rejected": -0.0537109375, + "logps/chosen": -374.5, + "logps/rejected": -310.25, + "loss": 1.8389892578125, + "memory(GiB)": 16.76, + "nll_loss": 1.171875, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0422515869140625, + "rewards/margins": 0.0543365478515625, + "rewards/rejected": -0.012115478515625, + "step": 5, + "train_speed(iter/s)": 0.24407 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.151375928472369, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.11972656100988388, + "logits/rejected": 0.15791015326976776, + "logps/chosen": -454.79998779296875, + "logps/rejected": -430.20001220703125, + "loss": 1.98125, + "memory(GiB)": 40.2, + "nll_loss": 1.4812500476837158, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.337499976158142, + "rewards/margins": 0.608593761920929, + "rewards/rejected": 0.7310546636581421, + "step": 10, + "train_speed(iter/s)": 0.260504 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.153194634947897, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -0.266357421875, + "logits/rejected": 0.06972656399011612, + "logps/chosen": -330.0, + "logps/rejected": -321.0, + "loss": 0.8549072265625, + "memory(GiB)": 40.2, + "nll_loss": 0.631640613079071, + "rewards/accuracies": 0.875, + "rewards/chosen": 5.53125, + "rewards/margins": 2.924999952316284, + "rewards/rejected": 2.6078124046325684, + "step": 15, + "train_speed(iter/s)": 0.28201 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 4.3476489647148195, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -0.601757824420929, + "logits/rejected": 0.5078125, + "logps/chosen": -237.3000030517578, + 
"logps/rejected": -317.0, + "loss": 0.882763671875, + "memory(GiB)": 40.2, + "nll_loss": 0.7523437738418579, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.8125, + "rewards/margins": 3.628124952316284, + "rewards/rejected": 5.199999809265137, + "step": 20, + "train_speed(iter/s)": 0.287161 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -2.203125, + "eval_logits/rejected": 1.0546875, + "eval_logps/chosen": -282.0, + "eval_logps/rejected": -440.0, + "eval_loss": 0.53515625, + "eval_nll_loss": 0.5234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.3125, + "eval_rewards/margins": 4.8125, + "eval_rewards/rejected": 5.53125, + "eval_runtime": 1.3114, + "eval_samples_per_second": 3.05, + "eval_steps_per_second": 0.763, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 5.9296423681187935, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -0.19545897841453552, + "logits/rejected": 0.32050782442092896, + "logps/chosen": -358.79998779296875, + "logps/rejected": -321.79998779296875, + "loss": 0.598291015625, + "memory(GiB)": 40.2, + "nll_loss": 0.5562499761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 4.737500190734863, + "rewards/rejected": 5.631249904632568, + "step": 25, + "train_speed(iter/s)": 0.282956 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8447788180902983, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -0.353515625, + "logits/rejected": 0.797656238079071, + "logps/chosen": -244.3000030517578, + "logps/rejected": -307.79998779296875, + "loss": 0.4865478515625, + "memory(GiB)": 40.2, + "nll_loss": 0.46484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.324999809265137, + "rewards/margins": 6.90625, + "rewards/rejected": 4.412499904632568, + "step": 30, + "train_speed(iter/s)": 0.284899 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.0119251792419137, + "learning_rate": 8.323979328069689e-05, + 
"logits/chosen": -0.39433592557907104, + "logits/rejected": 0.728515625, + "logps/chosen": -372.79998779296875, + "logps/rejected": -333.0, + "loss": 0.4839599609375, + "memory(GiB)": 40.2, + "nll_loss": 0.4814453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.350000381469727, + "rewards/margins": 9.037500381469727, + "rewards/rejected": 3.3062500953674316, + "step": 35, + "train_speed(iter/s)": 0.286585 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.0093735109752326, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -0.12558594346046448, + "logits/rejected": 0.776171863079071, + "logps/chosen": -264.79998779296875, + "logps/rejected": -355.20001220703125, + "loss": 0.44349365234375, + "memory(GiB)": 40.2, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.287500381469727, + "rewards/margins": 10.618749618530273, + "rewards/rejected": 1.6574218273162842, + "step": 40, + "train_speed(iter/s)": 0.288367 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -1.453125, + "eval_logits/rejected": 1.6875, + "eval_logps/chosen": -251.0, + "eval_logps/rejected": -496.0, + "eval_loss": 0.4453125, + "eval_nll_loss": 0.4453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.3125, + "eval_rewards/margins": 13.375, + "eval_rewards/rejected": -0.07421875, + "eval_runtime": 1.3301, + "eval_samples_per_second": 3.007, + "eval_steps_per_second": 0.752, + "step": 40 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 50930832113664.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..29938827121efe95d48f066a4d025bf5a409b6bd --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa79d06bb4e33d66ebd6b0d7dd842a9764bbfb50d6568115915200fbf4b64794 +size 9016 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-40/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/README.md b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4e3f02b35c0f2feed1d84197d9fc0806a438243 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..40645602d2512f13ca6ac33faf22744ab0483e37 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, 
+ "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fce172fbe3acd08df8e6098d4e54649452abb24c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afee13ac8e0d16d92b990e8e7a634b18b064fdf5d9e4f71f45c05be20964a918 +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/args.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/args.json new file mode 100644 index 0000000000000000000000000000000000000000..20902d3b3ef7088a13cabc36bdaf5bde9a4ef216 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + 
"model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + 
"per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": 
null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + 
"auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + 
"galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + 
"model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, 
per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': 
{'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, 
batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b0671e17cc312b752fa9f405d2ca22eade6f7d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a13c775d013875fc46a96fe06de84612a8721821da58f3a1db1a2763fcdfd751 +size 30281648 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c6468bc15a04153ee6a237ac58b5d5d31992365 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6dce8327b48dd9b556e8d5ba6198b70757e3af8312ddd0681665a793996b5ce +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58998f8fcbe9c7b1c8e4385d3058e1d4661a4881 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a85dedee7b86744963a68650faacbec8680f7def628bdbfce918da0c161acb3d +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..278e9bc95f8b0208e7c14cca1b1d881cb6240a5c --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa9aa9683ab348f47158d7a9e282e0bd9d44f63ea8db9b333526933dd7d5982e +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59f8d097bdfe93376ab81a14eecae4231a2edfc5 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:254d5738967a3adbe98cfea66cac16fbdb5c8e6b55fb47109d06f4dea73c635d +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2327e7346c999a2e6b602a6679d1df674785cdd0 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:268dee06de8a032b631eeba1381d4fd31204b37922f95bb99dc0fe030307c281 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bac13ed7680bd0b839b27e834ce7eabb2349499f --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7ec05dda1be3035c3f3f73fde34fedc266f5603930eb22234372c4b289ba792 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2d2ad7fa01ec69d8dd7de9efcc6e4b9e237bbbc --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5ec7dceffffcda8b29cdacdebc52718856932e0b0d9a099fba8f254d530125c +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7a041dbf9a890ff74d1f95010742b5e98365ade --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30230a5b28eaff72c19a92d47655a839abf5ed00f7ce071f2684334b043891bc 
+size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b7b22f791cbb21ce50f140d2596d5b3c824f7b4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9e3df898faf84be32c8c5fcc36380d466288b6d738e60099ebeb6524d65a2ef +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59041ab232214211a4666132871187c1598ab0f7 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7af15af88eb236f20b2c21dc57718fde6481f127e0755c4cdc79d2619759f73a +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..971065387b1f9d16a707621929a061e85385d3d2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_3_mp_rank_00_model_states.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bba44a30a3cf1a04bfc5860dbc080e42a8e154dc10c8b929c8315539566a4ee4 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfd0e4ff4d9774e65bf3354f3a9b7294572956f9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d802e73cd7b1daebec622dab8d182090da21676a4ae0cb13077a59f28f6a7fbf +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cad5c9962b0d28bbd9a28fc1e90203140126298 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae3e62fb9c149ef91c83d0b1c139d89123a336d909aa9a4eabad2a1a53961fd5 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f09344a3199befd4087b18534c43bb54ed15e55 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c36bac464b0c9840a9c1d83ec49aa133c1235c69e39766a0e3c59058096f9564 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36fadef4921cc151b7896a1eb22d8886e9d8fddd --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/global_step60/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f356ad790fd22622eca8109aa6c5963ed6f1434c10eb668d86bc7d47e0b66c2e +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/latest b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/latest new file mode 100644 index 0000000000000000000000000000000000000000..6dac34b840ecfb636ba8ab1e4da79fa1bdc8c3d4 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/latest @@ -0,0 +1 @@ +global_step60 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_0.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d31438b0bfd38acb69501aeb325fee7751b84e8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5a0ef6f96a48e59aa52c4b471312c2a62378c19acc7ebbae839612b03a7d775a +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6759906b0863c54055155658e8d374770ecfc5f9 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab11d533c0fdad46ea8b8e295ba5fdb705e078eeb88cc28f37d82913508766e9 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..784c719c042a2cca1f38818c7e9638aab398c859 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615c168147e3465ce5bfab6da2ff4afc68566ce00ec0f0c6c9fc988038a58d0a +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b32e705bcb6afbb2ab95f5c68c07d0ccc3d457df --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f71e8f8674ecaef9f8cdcbf7ac457a8b8ff15b12694ba2a2fffcb4b43f0f08 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_4.pth 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b97b2d3011e43a6dbac487263b52a0b3a55c83 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cf6d674dab5545c300a55135f08ca935730a3d35e2c419fb0b333f19482c19 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1acb3d3b1d3de061b997d1dee57e44b465d0630e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2754f2cd8824702f027870d93748b3c0491b0ecd30f1e3d8e937116b2be6151f +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7760bbbcd6d3754ac81a5218adb6e0cd8036905b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1385124ac55604598f45ea6e2d141f29456647d3e7c10d12ca64ec93d312be8d +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c4407057d0cb21c08140413cb320528190a941 --- /dev/null 
+++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416538efaec7391fa8fe782fb15146b83e5612d9e1961292c34c53e964806873 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d24bb2a6ed10249209e94b434ed554cac856d563 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c3a6465b9cb557a3a4db2097cdb877b1c624f5f645895d0cd27357a78258aa4 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dbb889bb91724b49aa38a4cd85d72f49fee1be46 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.42114258, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60", + "epoch": 1.5789473684210527, + "eval_steps": 20, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.782230723528885, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.8203125, + "logits/rejected": 0.0322265625, + "logps/chosen": -440.0, + "logps/rejected": -201.0, + "loss": 1.12451171875, + "memory(GiB)": 7.01, + "nll_loss": 0.43359375, + 
"rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.124832 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 11.149383195479057, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.10595703125, + "logits/rejected": -0.0537109375, + "logps/chosen": -374.5, + "logps/rejected": -310.25, + "loss": 1.8389892578125, + "memory(GiB)": 16.76, + "nll_loss": 1.171875, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0422515869140625, + "rewards/margins": 0.0543365478515625, + "rewards/rejected": -0.012115478515625, + "step": 5, + "train_speed(iter/s)": 0.24407 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.151375928472369, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.11972656100988388, + "logits/rejected": 0.15791015326976776, + "logps/chosen": -454.79998779296875, + "logps/rejected": -430.20001220703125, + "loss": 1.98125, + "memory(GiB)": 40.2, + "nll_loss": 1.4812500476837158, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.337499976158142, + "rewards/margins": 0.608593761920929, + "rewards/rejected": 0.7310546636581421, + "step": 10, + "train_speed(iter/s)": 0.260504 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.153194634947897, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -0.266357421875, + "logits/rejected": 0.06972656399011612, + "logps/chosen": -330.0, + "logps/rejected": -321.0, + "loss": 0.8549072265625, + "memory(GiB)": 40.2, + "nll_loss": 0.631640613079071, + "rewards/accuracies": 0.875, + "rewards/chosen": 5.53125, + "rewards/margins": 2.924999952316284, + "rewards/rejected": 2.6078124046325684, + "step": 15, + "train_speed(iter/s)": 0.28201 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 4.3476489647148195, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -0.601757824420929, + "logits/rejected": 0.5078125, + "logps/chosen": -237.3000030517578, + 
"logps/rejected": -317.0, + "loss": 0.882763671875, + "memory(GiB)": 40.2, + "nll_loss": 0.7523437738418579, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.8125, + "rewards/margins": 3.628124952316284, + "rewards/rejected": 5.199999809265137, + "step": 20, + "train_speed(iter/s)": 0.287161 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -2.203125, + "eval_logits/rejected": 1.0546875, + "eval_logps/chosen": -282.0, + "eval_logps/rejected": -440.0, + "eval_loss": 0.53515625, + "eval_nll_loss": 0.5234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.3125, + "eval_rewards/margins": 4.8125, + "eval_rewards/rejected": 5.53125, + "eval_runtime": 1.3114, + "eval_samples_per_second": 3.05, + "eval_steps_per_second": 0.763, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 5.9296423681187935, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -0.19545897841453552, + "logits/rejected": 0.32050782442092896, + "logps/chosen": -358.79998779296875, + "logps/rejected": -321.79998779296875, + "loss": 0.598291015625, + "memory(GiB)": 40.2, + "nll_loss": 0.5562499761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 4.737500190734863, + "rewards/rejected": 5.631249904632568, + "step": 25, + "train_speed(iter/s)": 0.282956 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8447788180902983, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -0.353515625, + "logits/rejected": 0.797656238079071, + "logps/chosen": -244.3000030517578, + "logps/rejected": -307.79998779296875, + "loss": 0.4865478515625, + "memory(GiB)": 40.2, + "nll_loss": 0.46484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.324999809265137, + "rewards/margins": 6.90625, + "rewards/rejected": 4.412499904632568, + "step": 30, + "train_speed(iter/s)": 0.284899 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.0119251792419137, + "learning_rate": 8.323979328069689e-05, + 
"logits/chosen": -0.39433592557907104, + "logits/rejected": 0.728515625, + "logps/chosen": -372.79998779296875, + "logps/rejected": -333.0, + "loss": 0.4839599609375, + "memory(GiB)": 40.2, + "nll_loss": 0.4814453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.350000381469727, + "rewards/margins": 9.037500381469727, + "rewards/rejected": 3.3062500953674316, + "step": 35, + "train_speed(iter/s)": 0.286585 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.0093735109752326, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -0.12558594346046448, + "logits/rejected": 0.776171863079071, + "logps/chosen": -264.79998779296875, + "logps/rejected": -355.20001220703125, + "loss": 0.44349365234375, + "memory(GiB)": 40.2, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.287500381469727, + "rewards/margins": 10.618749618530273, + "rewards/rejected": 1.6574218273162842, + "step": 40, + "train_speed(iter/s)": 0.288367 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -1.453125, + "eval_logits/rejected": 1.6875, + "eval_logps/chosen": -251.0, + "eval_logps/rejected": -496.0, + "eval_loss": 0.4453125, + "eval_nll_loss": 0.4453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.3125, + "eval_rewards/margins": 13.375, + "eval_rewards/rejected": -0.07421875, + "eval_runtime": 1.3301, + "eval_samples_per_second": 3.007, + "eval_steps_per_second": 0.752, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.7072609823649598, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -0.19873046875, + "logits/rejected": 1.137304663658142, + "logps/chosen": -304.20001220703125, + "logps/rejected": -368.3999938964844, + "loss": 0.44085693359375, + "memory(GiB)": 40.2, + "nll_loss": 0.4410156309604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.087499618530273, + "rewards/margins": 13.5625, + "rewards/rejected": 0.51171875, + "step": 45, + "train_speed(iter/s)": 0.284733 + }, + { + 
"epoch": 1.3157894736842106, + "grad_norm": 0.5247521627933828, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": 0.0985107421875, + "logits/rejected": 1.3796875476837158, + "logps/chosen": -297.3999938964844, + "logps/rejected": -401.6000061035156, + "loss": 0.426397705078125, + "memory(GiB)": 40.2, + "nll_loss": 0.4263671934604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.949999809265137, + "rewards/margins": 13.774999618530273, + "rewards/rejected": 1.167578101158142, + "step": 50, + "train_speed(iter/s)": 0.286939 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.27152778325772464, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -0.18417969346046448, + "logits/rejected": 1.4109375476837158, + "logps/chosen": -258.0, + "logps/rejected": -356.3999938964844, + "loss": 0.4094970703125, + "memory(GiB)": 40.2, + "nll_loss": 0.40937501192092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.487500190734863, + "rewards/margins": 13.762499809265137, + "rewards/rejected": 0.719531238079071, + "step": 55, + "train_speed(iter/s)": 0.29118 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.23345574148400033, + "learning_rate": 5e-05, + "logits/chosen": -0.095703125, + "logits/rejected": 1.083593726158142, + "logps/chosen": -260.79998779296875, + "logps/rejected": -298.6000061035156, + "loss": 0.27684326171875, + "memory(GiB)": 40.2, + "nll_loss": 0.2767578065395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.274999618530273, + "rewards/margins": 13.574999809265137, + "rewards/rejected": 1.7109375, + "step": 60, + "train_speed(iter/s)": 0.293391 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -1.15625, + "eval_logits/rejected": 1.859375, + "eval_logps/chosen": -239.0, + "eval_logps/rejected": -494.0, + "eval_loss": 0.421142578125, + "eval_nll_loss": 0.421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.625, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": 0.10107421875, 
+ "eval_runtime": 1.3288, + "eval_samples_per_second": 3.01, + "eval_steps_per_second": 0.753, + "step": 60 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 76435173146624.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..29938827121efe95d48f066a4d025bf5a409b6bd --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa79d06bb4e33d66ebd6b0d7dd842a9764bbfb50d6568115915200fbf4b64794 +size 9016 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/zero_to_fp32.py b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-60/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient. + Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint.
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/README.md b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4e3f02b35c0f2feed1d84197d9fc0806a438243 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/README.md @@ -0,0 +1,202 @@ +--- +base_model: /m2v_intern/wangruotong/logs/Models/deepseek-r1-7b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/adapter_config.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..40645602d2512f13ca6ac33faf22744ab0483e37 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [], + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, 
+ "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "up_proj", + "down_proj", + "k_proj", + "v_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/adapter_model.safetensors b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82593c4d8a81afbce8b772d98494a5c152bd2d71 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db4f1f38c06a85621c941fe679fa40e61361008aeb50b3662152cc2f47f2f36c +size 40422208 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/additional_config.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/additional_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe5159d1d10a158affb4d328c70025d891e16d8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/additional_config.json @@ -0,0 +1 @@ +{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/args.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/args.json new file mode 100644 index 0000000000000000000000000000000000000000..20902d3b3ef7088a13cabc36bdaf5bde9a4ef216 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/args.json @@ -0,0 +1,374 @@ +{ + "model": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "model_type": "deepseek_r1_distill", + 
"model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": null, + "num_labels": null, + "rope_scaling": null, + "device_map": null, + "local_repo_path": null, + "template": "deepseek_r1", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "tools_prompt": "react_en", + "norm_bbox": null, + "padding_side": "right", + "loss_scale": "last_round", + "sequence_parallel_size": 1, + "use_chat_template": true, + "template_backend": "swift", + "dataset": [ + "/home/wangruotong/LLM_test/dpo_400_what_0.5.jsonl" + ], + "val_dataset": [], + "split_dataset_ratio": 0.01, + "data_seed": 42, + "dataset_num_proc": 4, + "streaming": false, + "enable_cache": false, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": [ + null, + null + ], + "model_author": [ + null, + null + ], + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.7, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "ckpt_dir": null, + "load_dataset_config": null, + "lora_modules": [], + "tuner_backend": "peft", + "train_type": "lora", + "adapters": [], + "seed": 42, + "model_kwargs": {}, + "load_args": true, + "load_data_args": false, + "use_hf": false, + "hub_token": null, + "custom_register_path": [], + "ignore_args_error": false, + "use_swift_lora": false, + "output_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": false, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + 
"per_device_train_batch_size": 1, + "per_device_eval_batch_size": 1, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 1, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 0.0001, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3.0, + "max_steps": -1, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_ratio": 0.05, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs", + "logging_strategy": "steps", + "logging_first_step": true, + "logging_steps": 5, + "logging_nan_inf_filter": true, + "save_strategy": "steps", + "save_steps": 20.0, + "save_total_limit": 100, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": true, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": null, + "dataloader_drop_last": false, + "eval_steps": 20.0, + "dataloader_num_workers": 4, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": null, + "disable_tqdm": null, + "remove_unused_columns": false, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "fsdp": "", + "fsdp_min_num_params": 0, + "fsdp_config": null, + "fsdp_transformer_layer_cls_to_wrap": 
null, + "accelerator_config": { + "dispatch_batches": false + }, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1000000000.0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1000000000.0, + "stage3_max_reuse_distance": 1000000000.0, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [ + "tensorboard" + ], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "evaluation_strategy": "steps", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": null, + "mp_parameters": "", + 
"auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "dispatch_batches": null, + "split_batches": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "freeze_parameters": [], + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [], + "freeze_llm": false, + "freeze_vit": true, + "freeze_aligner": true, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + 
"galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "use_liger": false, + "model_layer_cls_name": null, + "metric_warmup_step": 0, + "fsdp_num": 1, + "acc_steps": 1, + "add_version": true, + "resume_only_model": false, + "check_model": true, + "create_checkpoint_symlink": false, + "packing": false, + "lazy_tokenize": false, + "loss_type": "sigmoid", + "optimizer": null, + "metric": null, + "acc_strategy": "token", + "reward_model": null, + "reward_adapters": [], + "reward_model_type": null, + "reward_model_revision": null, + "num_ppo_epochs": 4, + "whiten_rewards": false, + "kl_coef": 0.05, + "cliprange": 0.2, + "vf_coef": 0.1, + "cliprange_value": 0.2, + "gamma": 1.0, + "lam": 0.95, + "num_mini_batches": 1, + "local_rollout_forward_batch_size": 64, + "num_sample_generations": 10, + "response_length": 512, + "missing_eos_penalty": null, + "rlhf_type": "dpo", + "ref_model": null, + "ref_model_type": null, + "ref_model_revision": null, + "beta": 0.1, + "label_smoothing": 0, + "rpo_alpha": 1.0, + "cpo_alpha": 1.0, + "simpo_gamma": 1, + "desirable_weight": 1.0, + "undesirable_weight": 1.0, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "deepseek-r1-7b", + "model_info": "ModelInfo(model_type='deepseek_r1_distill', model_dir='/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, config=None, task_type='causal_lm', num_labels=None)", + 
"model_meta": "ModelMeta(model_type='deepseek_r1_distill', model_groups=[ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=['transformers>=4.37'], tags=[]), ModelGroup(models=[Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', hf_model_id='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='deepseek_r1', get_function=, model_arch='llama', architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=[], tags=[])", + "model_dir": "/m2v_intern/wangruotong/logs/Models/deepseek-r1-7b", + "hub": "", + "training_args": "DPOConfig(output_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, 
per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs', logging_strategy=, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=, save_steps=20, save_total_limit=100, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=4, dataloader_prefetch_factor=None, past_index=-1, run_name='/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': 
{'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy='steps', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, 
batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, model_init_kwargs=None, ref_model_init_kwargs=None, model_adapter_name=None, ref_adapter_name=None, force_use_ref_model=False, disable_dropout=True, use_num_logits_to_keep=False, dataset_num_proc=4, padding_value=None, label_pad_token_id=None, max_prompt_length=512, max_completion_length=None, max_length=4096, truncation_mode='keep_end', padding_free=False, precompute_ref_log_probs=False, precompute_ref_batch_size=None, tools=None, loss_type='sigmoid', beta=0.1, f_divergence_type=, f_alpha_divergence_coef=1.0, reference_free=False, label_smoothing=0, use_weighting=False, rpo_alpha=1.0, discopop_tau=0.05, sync_ref_model=False, ref_model_mixup_alpha=0.9, ref_model_sync_steps=64, generate_during_eval=False, is_encoder_decoder=False, acc_strategy='token', sequence_parallel_size=1, check_model=True, train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1, fsdp_num=1, acc_steps=1, train_type='lora', optimizer=None, galore_config=None)" +} \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3657a54539718170ee4cd624b07d3052f02b4fa1 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be251fd6166323b95346e80a7024149eb2ca5249fd43336f3a957868ff6be39 +size 30281648 diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c615b9c850dc218b066b763369279d3565f1b7f3 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bc449974fd7d02c1fa7ef921413741b55ff70186430e64a67a85f8e36a589e4 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b5bb69bd03b1e1588cc8b4f7b2e4fd7f61f356e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61778f92d30da50dfa21f4f32d0e079771b2c636154ef3401c88935da7d92d9e +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d32fcc73c521a05118effc19295e90233ba81833 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a10391b061b3da7ec05fd16e32fe8d73d39ee9b7f9863705b3c433b9fcfe4a3 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..007ed8bf15a6a0d4e5161e0af57efabb936356fa --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca835b837981b7859f1ba9dad21d28a5ae155357c0a4c7a6e0162ae2b4e15982 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d52d7499ac17a38c2d982f2238b4bd9d272fedc8 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adb184b24242133896ab6011a6175fc3e964b7a3b0c3c300ec2ba645263ce3a9 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22ad7b65cb0a6fd8ab94d45d9db2e81326dfbd40 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01d2b5e17cc387c5d225dc4b20304e5125e7b84ff9416ed70292f8caa44f5cf6 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a641aa618850cb09bfac529ffe189d6266eaff21 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:397c941787a20db3a6a1880841aecf72287b91b8ea5b12118436c56b295162f2 +size 30281648 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..907119852928ed09d6e797d0c8b6b6afbcc39c47 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41be4a3da2e1d9ada88a564e57e68a179d0f4d3015921aa43b816bb2db11d812 
+size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b03895f022afa69ae39102dc977acb8a69da371b --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:903a995694e42d740831ac0f804c917f6d94d6256abc61366d53140608274561 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1f2797a5d6a72fc2ee526fad3243809fc1b0e93 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d612f5cc1f04060a662b881b69eef2573f37a9f123991fb2e4ec8fbf75090f7 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e09da547b800b7ee1fdaf3d0cdba50cd76f720eb --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_3_mp_rank_00_model_states.pt 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a36f568d392a165e7cdfd23a674624aff885b7e542c817dc85f17338f3d031f +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c21c94cd3a45200995942b6f7205f6116c015b2 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1975d338ad77bacca2045078cb8c938feef232709154e081c7ec29ef6c38ed9 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53218daad5d247dd2ba46b15cb2c014627ad35cb --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b998f129ff41d54973b5648e13a69503899063871132189954c90d31c6f14900 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46f4c0dd4b1b825f665721be91ae4d1980f46212 --- /dev/null +++ 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e07456b371d001f8e5eeeffeafa980aec54e12c55c9485335fd694f01abae179 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f1bba5eeb8a975ae06eef0bdfce3b1800293178 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/global_step80/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e62e8db390fea28ab5c6c90f21d525c6ee93880a05ab99e7a37800e65ef5a34 +size 388374 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/latest b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/latest new file mode 100644 index 0000000000000000000000000000000000000000..75eab498d0366633484ab40334e4b8fb92b16dad --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/latest @@ -0,0 +1 @@ +global_step80 \ No newline at end of file diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_0.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..572d9bd86f4559e91e7b9a4fdc47494e5c6e9568 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a7d7e02ffb4d440dce7ab4ce0b5617578ec9ce3672acee7434ed6f1153f1ae0c +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_1.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d21df4c1d8717a3994f151fbc05460a0172725e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b40ca759e432b2688f021b81291d74a40f56a205e9842119f7e772275eebd3 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_2.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6048bfa1e35e3b563aec9f5c1c6788496c3f068d --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdaef955ddd36d6bc1c40584113dd6205483e2aa85b02439b8b27e82e02a8359 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_3.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3722ed81a034ae380c794d8b45b0464c00099aa6 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b14ae5db356e6512538751d6b386c190754e307cc99cd652d5c6dd891e1f82 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_4.pth 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..13231ff967baa9c056d5a7ec0cc489a62679039c --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26e28be26826eeeed244b77185c67b443ac185175f8d4bf5ba94caa8b271bc5 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_5.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3401deecf687fd1382dae699b8d2e1a52949a4a --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847cedc1d6ca26f299a132c2ade9754887374acb9d98f26594a85d4c7742d474 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_6.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ab10b8ff32ba08d69bdf75cb904d226b3d9008 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd043d1690ae0ff6991b03322799a0b28f021427b15fd9f1e5ed8b9905d9307 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_7.pth b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c6fb5670c4f108f08c81f04f22272cdd57b7745 --- /dev/null 
+++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772190f7e6667c865d25fc72da7bdd1b5d39f46fe03bb5c2d754aee1ad3c99c7 +size 15984 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/scheduler.pt b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..eaf96d6803aea265d756d902db3c4cc2386f9742 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90524bcdb94734ac7120e4205110f14662bff8cee00eed50355875dcdc538029 +size 1064 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/trainer_state.json b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3016edd47db2f32aa2ba108a90413d7244d9cc64 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/trainer_state.json @@ -0,0 +1,407 @@ +{ + "best_metric": 0.40429688, + "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80", + "epoch": 2.1052631578947367, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02631578947368421, + "grad_norm": 9.782230723528885, + "learning_rate": 1.6666666666666667e-05, + "logits/chosen": 0.8203125, + "logits/rejected": 0.0322265625, + "logps/chosen": -440.0, + "logps/rejected": -201.0, + "loss": 1.12451171875, + "memory(GiB)": 7.01, + "nll_loss": 0.43359375, + 
"rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "train_speed(iter/s)": 0.124832 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 11.149383195479057, + "learning_rate": 8.333333333333334e-05, + "logits/chosen": -0.10595703125, + "logits/rejected": -0.0537109375, + "logps/chosen": -374.5, + "logps/rejected": -310.25, + "loss": 1.8389892578125, + "memory(GiB)": 16.76, + "nll_loss": 1.171875, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0422515869140625, + "rewards/margins": 0.0543365478515625, + "rewards/rejected": -0.012115478515625, + "step": 5, + "train_speed(iter/s)": 0.24407 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.151375928472369, + "learning_rate": 9.966191788709716e-05, + "logits/chosen": 0.11972656100988388, + "logits/rejected": 0.15791015326976776, + "logps/chosen": -454.79998779296875, + "logps/rejected": -430.20001220703125, + "loss": 1.98125, + "memory(GiB)": 40.2, + "nll_loss": 1.4812500476837158, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.337499976158142, + "rewards/margins": 0.608593761920929, + "rewards/rejected": 0.7310546636581421, + "step": 10, + "train_speed(iter/s)": 0.260504 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.153194634947897, + "learning_rate": 9.829629131445342e-05, + "logits/chosen": -0.266357421875, + "logits/rejected": 0.06972656399011612, + "logps/chosen": -330.0, + "logps/rejected": -321.0, + "loss": 0.8549072265625, + "memory(GiB)": 40.2, + "nll_loss": 0.631640613079071, + "rewards/accuracies": 0.875, + "rewards/chosen": 5.53125, + "rewards/margins": 2.924999952316284, + "rewards/rejected": 2.6078124046325684, + "step": 15, + "train_speed(iter/s)": 0.28201 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 4.3476489647148195, + "learning_rate": 9.591080534401371e-05, + "logits/chosen": -0.601757824420929, + "logits/rejected": 0.5078125, + "logps/chosen": -237.3000030517578, + 
"logps/rejected": -317.0, + "loss": 0.882763671875, + "memory(GiB)": 40.2, + "nll_loss": 0.7523437738418579, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 8.8125, + "rewards/margins": 3.628124952316284, + "rewards/rejected": 5.199999809265137, + "step": 20, + "train_speed(iter/s)": 0.287161 + }, + { + "epoch": 0.5263157894736842, + "eval_logits/chosen": -2.203125, + "eval_logits/rejected": 1.0546875, + "eval_logps/chosen": -282.0, + "eval_logps/rejected": -440.0, + "eval_loss": 0.53515625, + "eval_nll_loss": 0.5234375, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 10.3125, + "eval_rewards/margins": 4.8125, + "eval_rewards/rejected": 5.53125, + "eval_runtime": 1.3114, + "eval_samples_per_second": 3.05, + "eval_steps_per_second": 0.763, + "step": 20 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 5.9296423681187935, + "learning_rate": 9.255583362184999e-05, + "logits/chosen": -0.19545897841453552, + "logits/rejected": 0.32050782442092896, + "logps/chosen": -358.79998779296875, + "logps/rejected": -321.79998779296875, + "loss": 0.598291015625, + "memory(GiB)": 40.2, + "nll_loss": 0.5562499761581421, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.375, + "rewards/margins": 4.737500190734863, + "rewards/rejected": 5.631249904632568, + "step": 25, + "train_speed(iter/s)": 0.282956 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.8447788180902983, + "learning_rate": 8.83022221559489e-05, + "logits/chosen": -0.353515625, + "logits/rejected": 0.797656238079071, + "logps/chosen": -244.3000030517578, + "logps/rejected": -307.79998779296875, + "loss": 0.4865478515625, + "memory(GiB)": 40.2, + "nll_loss": 0.46484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.324999809265137, + "rewards/margins": 6.90625, + "rewards/rejected": 4.412499904632568, + "step": 30, + "train_speed(iter/s)": 0.284899 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.0119251792419137, + "learning_rate": 8.323979328069689e-05, + 
"logits/chosen": -0.39433592557907104, + "logits/rejected": 0.728515625, + "logps/chosen": -372.79998779296875, + "logps/rejected": -333.0, + "loss": 0.4839599609375, + "memory(GiB)": 40.2, + "nll_loss": 0.4814453125, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.350000381469727, + "rewards/margins": 9.037500381469727, + "rewards/rejected": 3.3062500953674316, + "step": 35, + "train_speed(iter/s)": 0.286585 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.0093735109752326, + "learning_rate": 7.74754489035403e-05, + "logits/chosen": -0.12558594346046448, + "logits/rejected": 0.776171863079071, + "logps/chosen": -264.79998779296875, + "logps/rejected": -355.20001220703125, + "loss": 0.44349365234375, + "memory(GiB)": 40.2, + "nll_loss": 0.55859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 12.287500381469727, + "rewards/margins": 10.618749618530273, + "rewards/rejected": 1.6574218273162842, + "step": 40, + "train_speed(iter/s)": 0.288367 + }, + { + "epoch": 1.0526315789473684, + "eval_logits/chosen": -1.453125, + "eval_logits/rejected": 1.6875, + "eval_logps/chosen": -251.0, + "eval_logps/rejected": -496.0, + "eval_loss": 0.4453125, + "eval_nll_loss": 0.4453125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 13.3125, + "eval_rewards/margins": 13.375, + "eval_rewards/rejected": -0.07421875, + "eval_runtime": 1.3301, + "eval_samples_per_second": 3.007, + "eval_steps_per_second": 0.752, + "step": 40 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.7072609823649598, + "learning_rate": 7.113091308703498e-05, + "logits/chosen": -0.19873046875, + "logits/rejected": 1.137304663658142, + "logps/chosen": -304.20001220703125, + "logps/rejected": -368.3999938964844, + "loss": 0.44085693359375, + "memory(GiB)": 40.2, + "nll_loss": 0.4410156309604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.087499618530273, + "rewards/margins": 13.5625, + "rewards/rejected": 0.51171875, + "step": 45, + "train_speed(iter/s)": 0.284733 + }, + { + 
"epoch": 1.3157894736842106, + "grad_norm": 0.5247521627933828, + "learning_rate": 6.434016163555452e-05, + "logits/chosen": 0.0985107421875, + "logits/rejected": 1.3796875476837158, + "logps/chosen": -297.3999938964844, + "logps/rejected": -401.6000061035156, + "loss": 0.426397705078125, + "memory(GiB)": 40.2, + "nll_loss": 0.4263671934604645, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.949999809265137, + "rewards/margins": 13.774999618530273, + "rewards/rejected": 1.167578101158142, + "step": 50, + "train_speed(iter/s)": 0.286939 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.27152778325772464, + "learning_rate": 5.724659296536233e-05, + "logits/chosen": -0.18417969346046448, + "logits/rejected": 1.4109375476837158, + "logps/chosen": -258.0, + "logps/rejected": -356.3999938964844, + "loss": 0.4094970703125, + "memory(GiB)": 40.2, + "nll_loss": 0.40937501192092896, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.487500190734863, + "rewards/margins": 13.762499809265137, + "rewards/rejected": 0.719531238079071, + "step": 55, + "train_speed(iter/s)": 0.29118 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.23345574148400033, + "learning_rate": 5e-05, + "logits/chosen": -0.095703125, + "logits/rejected": 1.083593726158142, + "logps/chosen": -260.79998779296875, + "logps/rejected": -298.6000061035156, + "loss": 0.27684326171875, + "memory(GiB)": 40.2, + "nll_loss": 0.2767578065395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.274999618530273, + "rewards/margins": 13.574999809265137, + "rewards/rejected": 1.7109375, + "step": 60, + "train_speed(iter/s)": 0.293391 + }, + { + "epoch": 1.5789473684210527, + "eval_logits/chosen": -1.15625, + "eval_logits/rejected": 1.859375, + "eval_logps/chosen": -239.0, + "eval_logps/rejected": -494.0, + "eval_loss": 0.421142578125, + "eval_nll_loss": 0.421875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 14.625, + "eval_rewards/margins": 14.5, + "eval_rewards/rejected": 0.10107421875, 
+ "eval_runtime": 1.3288, + "eval_samples_per_second": 3.01, + "eval_steps_per_second": 0.753, + "step": 60 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.6365766166484632, + "learning_rate": 4.275340703463767e-05, + "logits/chosen": -0.505664050579071, + "logits/rejected": 1.177148461341858, + "logps/chosen": -259.79998779296875, + "logps/rejected": -357.20001220703125, + "loss": 0.3337158203125, + "memory(GiB)": 40.2, + "nll_loss": 0.33378905057907104, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.637499809265137, + "rewards/margins": 14.550000190734863, + "rewards/rejected": 1.1103515625, + "step": 65, + "train_speed(iter/s)": 0.292443 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.5652403639285394, + "learning_rate": 3.5659838364445505e-05, + "logits/chosen": -0.01230468787252903, + "logits/rejected": 1.0339844226837158, + "logps/chosen": -191.60000610351562, + "logps/rejected": -327.3999938964844, + "loss": 0.3323974609375, + "memory(GiB)": 40.2, + "nll_loss": 0.3324218690395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 14.4375, + "rewards/margins": 13.324999809265137, + "rewards/rejected": 1.1160156726837158, + "step": 70, + "train_speed(iter/s)": 0.292808 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.9034319408629943, + "learning_rate": 2.886908691296504e-05, + "logits/chosen": -0.39921873807907104, + "logits/rejected": 1.267187476158142, + "logps/chosen": -294.20001220703125, + "logps/rejected": -371.20001220703125, + "loss": 0.39881591796875, + "memory(GiB)": 40.2, + "nll_loss": 0.3984375, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.975000381469727, + "rewards/margins": 15.925000190734863, + "rewards/rejected": 0.05000000074505806, + "step": 75, + "train_speed(iter/s)": 0.293937 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.6395555179201285, + "learning_rate": 2.25245510964597e-05, + "logits/chosen": -0.22714844346046448, + "logits/rejected": 1.212499976158142, + "logps/chosen": 
-236.60000610351562, + "logps/rejected": -388.0, + "loss": 0.356890869140625, + "memory(GiB)": 40.2, + "nll_loss": 0.3724609315395355, + "rewards/accuracies": 1.0, + "rewards/chosen": 15.337499618530273, + "rewards/margins": 15.037500381469727, + "rewards/rejected": 0.32695311307907104, + "step": 80, + "train_speed(iter/s)": 0.292172 + }, + { + "epoch": 2.1052631578947367, + "eval_logits/chosen": -1.1484375, + "eval_logits/rejected": 1.8125, + "eval_logps/chosen": -233.0, + "eval_logps/rejected": -490.0, + "eval_loss": 0.404296875, + "eval_nll_loss": 0.404296875, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 15.1875, + "eval_rewards/margins": 14.5625, + "eval_rewards/rejected": 0.625, + "eval_runtime": 1.3795, + "eval_samples_per_second": 2.9, + "eval_steps_per_second": 0.725, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 114, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 102266984464384.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/training_args.bin b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..29938827121efe95d48f066a4d025bf5a409b6bd --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa79d06bb4e33d66ebd6b0d7dd842a9764bbfb50d6568115915200fbf4b64794 +size 9016 diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/zero_to_fp32.py 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-80/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logits_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..16aec1cbd470dd6bc4ff11e33b0dd0fc611751e5 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logits_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logits_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..3d8b7cda87a872bb018bb94c31842a96889096ba Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logits_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logps_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..f2f6715f4b054c5f532266c9c2df8625a789a250 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logps_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logps_rejected.png 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..721bc90eacc9f7ada4b2dee1a39d47418a59fabb Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_logps_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_loss.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..d21828e5854458c02068947fb3b7df17bb5089e7 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_loss.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_nll_loss.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..a0b6eabcf9c9ef00465123955082f63a130649b8 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_nll_loss.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_accuracies.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..b359853ac747638a7e44efa30e8bcf81f3befcfc Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_accuracies.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_chosen.png new file mode 100644 index 
0000000000000000000000000000000000000000..db97637a60a3dd670f453e3f9c821782567003c2 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_margins.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..a5c9cd291894de084a11682920fe5dc272d632fe Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_margins.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..c31a1d5e13cff0f79e2b3cb67b6b5e75899fff9f Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_rewards_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_runtime.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..2406761e49bcfa08df316a07ee2be0718d762b63 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_runtime.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_samples_per_second.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..753fe110db8b3366ab6ad098081e471f4da40acd Binary files /dev/null and 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_samples_per_second.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_steps_per_second.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..510f60c58b6828ff53816ecd5203457905e006d1 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/eval_steps_per_second.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_epoch.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..c152c9bbb1100d680125e899d3d0da3db98dc221 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_epoch.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_grad_norm.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..0145a25d0c0b3bf0db504c7d6db28b758923b407 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_grad_norm.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_learning_rate.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..e35e42c173f451f9666ba448872c615528441c9e Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_learning_rate.png differ diff --git 
a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logits_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logits_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..02fe50e53b737b539497df7be22ee70aab71e2b3 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logits_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logits_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logits_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..329deec68e090aa826428bec4305800bb1b787f2 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logits_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logps_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logps_chosen.png new file mode 100644 index 0000000000000000000000000000000000000000..3fb8e79dc58d9891e0835b3ddb7d4391c342cec0 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logps_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logps_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logps_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..fb196ef67fe1e87bdb6f893078a3f0806acac8f8 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_logps_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_loss.png 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..59304972200de53461a941c7c04fa67e1a313bc4 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_loss.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_memory(GiB).png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..2f4c0343fd95e1696f33d506783fc994c75c0959 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_memory(GiB).png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_nll_loss.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_nll_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..cd57895fcf66382594cd6279c4a8bf01f8c9fc50 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_nll_loss.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_accuracies.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_accuracies.png new file mode 100644 index 0000000000000000000000000000000000000000..f39f0804413098b287a2a80e46ba44bf957cb47f Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_accuracies.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_chosen.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_chosen.png new file mode 100644 index 
0000000000000000000000000000000000000000..6365defc65028925dc9c2aa39591fe5dac0508d2 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_chosen.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_margins.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_margins.png new file mode 100644 index 0000000000000000000000000000000000000000..f0fede1195bdf7d95005224582b0c4e4b0d94512 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_margins.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_rejected.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_rejected.png new file mode 100644 index 0000000000000000000000000000000000000000..c01b47fb83d23fba3b42b4ffc52b218e847a744e Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_rewards_rejected.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_total_flos.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_total_flos.png new file mode 100644 index 0000000000000000000000000000000000000000..7119fb287a0366bae3332192bcf97058d1f8d746 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_total_flos.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_loss.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..ca1f9167226500e1783be5f7bd01d4ce7fe2267f Binary files /dev/null and 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_loss.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_runtime.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_runtime.png new file mode 100644 index 0000000000000000000000000000000000000000..f64c143feb6774c55d2fa5a0cc13275292cee43f Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_runtime.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_samples_per_second.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_samples_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..f59b2fe40dba1504d57f051678e9d8f1b3495854 Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_samples_per_second.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_speed(iter_s).png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_speed(iter_s).png new file mode 100644 index 0000000000000000000000000000000000000000..faf9e12d6f28f3a7e86dc99c7c75563ec6cfd5bc Binary files /dev/null and b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_speed(iter_s).png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_steps_per_second.png b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_steps_per_second.png new file mode 100644 index 0000000000000000000000000000000000000000..7c391918b2652468fad325f09c2b2815000466f3 Binary files /dev/null and 
b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/images/train_train_steps_per_second.png differ diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/logging.jsonl b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3af66e2f39ee3264786195de1d792312a1d3ae74 --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/logging.jsonl @@ -0,0 +1,31 @@ +{"loss": 1.12451172, "grad_norm": 9.78223072, "learning_rate": 1.667e-05, "memory(GiB)": 7.01, "train_speed(iter/s)": 0.124832, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/chosen": -440.0, "logps/rejected": -201.0, "logits/chosen": 0.8203125, "logits/rejected": 0.03222656, "nll_loss": 0.43359375, "epoch": 0.02631579, "global_step/max_steps": "1/114", "percentage": "0.88%", "elapsed_time": "5s", "remaining_time": "9m 47s"} +{"loss": 1.83898926, "grad_norm": 11.1493832, "learning_rate": 8.333e-05, "memory(GiB)": 16.76, "train_speed(iter/s)": 0.24407, "rewards/chosen": 0.04225159, "rewards/rejected": -0.01211548, "rewards/accuracies": 0.34375, "rewards/margins": 0.05433655, "logps/chosen": -374.5, "logps/rejected": -310.25, "logits/chosen": -0.10595703, "logits/rejected": -0.05371094, "nll_loss": 1.171875, "epoch": 0.13157895, "global_step/max_steps": "5/114", "percentage": "4.39%", "elapsed_time": "17s", "remaining_time": "6m 25s"} +{"loss": 1.98125, "grad_norm": 10.15137593, "learning_rate": 9.966e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.260504, "rewards/chosen": 1.33749998, "rewards/rejected": 0.73105466, "rewards/accuracies": 0.77499998, "rewards/margins": 0.60859376, "logps/chosen": -454.79998779, "logps/rejected": -430.20001221, "logits/chosen": 0.11972656, "logits/rejected": 0.15791015, "nll_loss": 1.48125005, "epoch": 0.26315789, "global_step/max_steps": "10/114", 
"percentage": "8.77%", "elapsed_time": "35s", "remaining_time": "6m 10s"} +{"loss": 0.85490723, "grad_norm": 4.15319463, "learning_rate": 9.83e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.28201, "rewards/chosen": 5.53125, "rewards/rejected": 2.6078124, "rewards/accuracies": 0.875, "rewards/margins": 2.92499995, "logps/chosen": -330.0, "logps/rejected": -321.0, "logits/chosen": -0.26635742, "logits/rejected": 0.06972656, "nll_loss": 0.63164061, "epoch": 0.39473684, "global_step/max_steps": "15/114", "percentage": "13.16%", "elapsed_time": "50s", "remaining_time": "5m 32s"} +{"loss": 0.88276367, "grad_norm": 4.34764896, "learning_rate": 9.591e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.287161, "rewards/chosen": 8.8125, "rewards/rejected": 5.19999981, "rewards/accuracies": 0.97500002, "rewards/margins": 3.62812495, "logps/chosen": -237.30000305, "logps/rejected": -317.0, "logits/chosen": -0.60175782, "logits/rejected": 0.5078125, "nll_loss": 0.75234377, "epoch": 0.52631579, "global_step/max_steps": "20/114", "percentage": "17.54%", "elapsed_time": "1m 6s", "remaining_time": "5m 14s"} +{"eval_loss": 0.53515625, "eval_runtime": 1.3114, "eval_samples_per_second": 3.05, "eval_steps_per_second": 0.763, "eval_rewards/chosen": 10.3125, "eval_rewards/rejected": 5.53125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 4.8125, "eval_logps/chosen": -282.0, "eval_logps/rejected": -440.0, "eval_logits/chosen": -2.203125, "eval_logits/rejected": 1.0546875, "eval_nll_loss": 0.5234375, "epoch": 0.52631579, "global_step/max_steps": "20/114", "percentage": "17.54%", "elapsed_time": "1m 8s", "remaining_time": "5m 20s"} +{"loss": 0.59829102, "grad_norm": 5.92964237, "learning_rate": 9.256e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.282956, "rewards/chosen": 10.375, "rewards/rejected": 5.6312499, "rewards/accuracies": 1.0, "rewards/margins": 4.73750019, "logps/chosen": -358.79998779, "logps/rejected": -321.79998779, "logits/chosen": -0.19545898, 
"logits/rejected": 0.32050782, "nll_loss": 0.55624998, "epoch": 0.65789474, "global_step/max_steps": "25/114", "percentage": "21.93%", "elapsed_time": "1m 25s", "remaining_time": "5m 4s"} +{"loss": 0.48654785, "grad_norm": 0.84477882, "learning_rate": 8.83e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.284899, "rewards/chosen": 11.32499981, "rewards/rejected": 4.4124999, "rewards/accuracies": 1.0, "rewards/margins": 6.90625, "logps/chosen": -244.30000305, "logps/rejected": -307.79998779, "logits/chosen": -0.35351562, "logits/rejected": 0.79765624, "nll_loss": 0.46484375, "epoch": 0.78947368, "global_step/max_steps": "30/114", "percentage": "26.32%", "elapsed_time": "1m 42s", "remaining_time": "4m 46s"} +{"loss": 0.48395996, "grad_norm": 1.01192518, "learning_rate": 8.324e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.286585, "rewards/chosen": 12.35000038, "rewards/rejected": 3.3062501, "rewards/accuracies": 1.0, "rewards/margins": 9.03750038, "logps/chosen": -372.79998779, "logps/rejected": -333.0, "logits/chosen": -0.39433593, "logits/rejected": 0.72851562, "nll_loss": 0.48144531, "epoch": 0.92105263, "global_step/max_steps": "35/114", "percentage": "30.70%", "elapsed_time": "1m 59s", "remaining_time": "4m 29s"} +{"loss": 0.44349365, "grad_norm": 1.00937351, "learning_rate": 7.748e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.288367, "rewards/chosen": 12.28750038, "rewards/rejected": 1.65742183, "rewards/accuracies": 1.0, "rewards/margins": 10.61874962, "logps/chosen": -264.79998779, "logps/rejected": -355.20001221, "logits/chosen": -0.12558594, "logits/rejected": 0.77617186, "nll_loss": 0.55859375, "epoch": 1.05263158, "global_step/max_steps": "40/114", "percentage": "35.09%", "elapsed_time": "2m 15s", "remaining_time": "4m 11s"} +{"eval_loss": 0.4453125, "eval_runtime": 1.3301, "eval_samples_per_second": 3.007, "eval_steps_per_second": 0.752, "eval_rewards/chosen": 13.3125, "eval_rewards/rejected": -0.07421875, "eval_rewards/accuracies": 1.0, 
"eval_rewards/margins": 13.375, "eval_logps/chosen": -251.0, "eval_logps/rejected": -496.0, "eval_logits/chosen": -1.453125, "eval_logits/rejected": 1.6875, "eval_nll_loss": 0.4453125, "epoch": 1.05263158, "global_step/max_steps": "40/114", "percentage": "35.09%", "elapsed_time": "2m 17s", "remaining_time": "4m 13s"} +{"loss": 0.44085693, "grad_norm": 0.70726098, "learning_rate": 7.113e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.284733, "rewards/chosen": 14.08749962, "rewards/rejected": 0.51171875, "rewards/accuracies": 1.0, "rewards/margins": 13.5625, "logps/chosen": -304.20001221, "logps/rejected": -368.3999939, "logits/chosen": -0.19873047, "logits/rejected": 1.13730466, "nll_loss": 0.44101563, "epoch": 1.18421053, "global_step/max_steps": "45/114", "percentage": "39.47%", "elapsed_time": "2m 35s", "remaining_time": "3m 58s"} +{"loss": 0.42639771, "grad_norm": 0.52475216, "learning_rate": 6.434e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.286939, "rewards/chosen": 14.94999981, "rewards/rejected": 1.1675781, "rewards/accuracies": 1.0, "rewards/margins": 13.77499962, "logps/chosen": -297.3999939, "logps/rejected": -401.6000061, "logits/chosen": 0.09851074, "logits/rejected": 1.37968755, "nll_loss": 0.42636719, "epoch": 1.31578947, "global_step/max_steps": "50/114", "percentage": "43.86%", "elapsed_time": "2m 51s", "remaining_time": "3m 39s"} +{"loss": 0.40949707, "grad_norm": 0.27152778, "learning_rate": 5.725e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.29118, "rewards/chosen": 14.48750019, "rewards/rejected": 0.71953124, "rewards/accuracies": 1.0, "rewards/margins": 13.76249981, "logps/chosen": -258.0, "logps/rejected": -356.3999939, "logits/chosen": -0.18417969, "logits/rejected": 1.41093755, "nll_loss": 0.40937501, "epoch": 1.44736842, "global_step/max_steps": "55/114", "percentage": "48.25%", "elapsed_time": "3m 6s", "remaining_time": "3m 19s"} +{"loss": 0.27684326, "grad_norm": 0.23345574, "learning_rate": 5e-05, "memory(GiB)": 40.2, 
"train_speed(iter/s)": 0.293391, "rewards/chosen": 15.27499962, "rewards/rejected": 1.7109375, "rewards/accuracies": 1.0, "rewards/margins": 13.57499981, "logps/chosen": -260.79998779, "logps/rejected": -298.6000061, "logits/chosen": -0.09570312, "logits/rejected": 1.08359373, "nll_loss": 0.27675781, "epoch": 1.57894737, "global_step/max_steps": "60/114", "percentage": "52.63%", "elapsed_time": "3m 21s", "remaining_time": "3m 1s"} +{"eval_loss": 0.42114258, "eval_runtime": 1.3288, "eval_samples_per_second": 3.01, "eval_steps_per_second": 0.753, "eval_rewards/chosen": 14.625, "eval_rewards/rejected": 0.10107422, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.5, "eval_logps/chosen": -239.0, "eval_logps/rejected": -494.0, "eval_logits/chosen": -1.15625, "eval_logits/rejected": 1.859375, "eval_nll_loss": 0.421875, "epoch": 1.57894737, "global_step/max_steps": "60/114", "percentage": "52.63%", "elapsed_time": "3m 23s", "remaining_time": "3m 2s"} +{"loss": 0.33371582, "grad_norm": 0.63657662, "learning_rate": 4.275e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.292443, "rewards/chosen": 15.63749981, "rewards/rejected": 1.11035156, "rewards/accuracies": 1.0, "rewards/margins": 14.55000019, "logps/chosen": -259.79998779, "logps/rejected": -357.20001221, "logits/chosen": -0.50566405, "logits/rejected": 1.17714846, "nll_loss": 0.33378905, "epoch": 1.71052632, "global_step/max_steps": "65/114", "percentage": "57.02%", "elapsed_time": "3m 39s", "remaining_time": "2m 45s"} +{"loss": 0.33239746, "grad_norm": 0.56524036, "learning_rate": 3.566e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.292808, "rewards/chosen": 14.4375, "rewards/rejected": 1.11601567, "rewards/accuracies": 1.0, "rewards/margins": 13.32499981, "logps/chosen": -191.6000061, "logps/rejected": -327.3999939, "logits/chosen": -0.01230469, "logits/rejected": 1.03398442, "nll_loss": 0.33242187, "epoch": 1.84210526, "global_step/max_steps": "70/114", "percentage": "61.40%", "elapsed_time": "3m 
56s", "remaining_time": "2m 28s"} +{"loss": 0.39881592, "grad_norm": 0.90343194, "learning_rate": 2.887e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.293937, "rewards/chosen": 15.97500038, "rewards/rejected": 0.05, "rewards/accuracies": 1.0, "rewards/margins": 15.92500019, "logps/chosen": -294.20001221, "logps/rejected": -371.20001221, "logits/chosen": -0.39921874, "logits/rejected": 1.26718748, "nll_loss": 0.3984375, "epoch": 1.97368421, "global_step/max_steps": "75/114", "percentage": "65.79%", "elapsed_time": "4m 12s", "remaining_time": "2m 11s"} +{"loss": 0.35689087, "grad_norm": 0.63955552, "learning_rate": 2.252e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.292172, "rewards/chosen": 15.33749962, "rewards/rejected": 0.32695311, "rewards/accuracies": 1.0, "rewards/margins": 15.03750038, "logps/chosen": -236.6000061, "logps/rejected": -388.0, "logits/chosen": -0.22714844, "logits/rejected": 1.21249998, "nll_loss": 0.37246093, "epoch": 2.10526316, "global_step/max_steps": "80/114", "percentage": "70.18%", "elapsed_time": "4m 31s", "remaining_time": "1m 55s"} +{"eval_loss": 0.40429688, "eval_runtime": 1.3795, "eval_samples_per_second": 2.9, "eval_steps_per_second": 0.725, "eval_rewards/chosen": 15.1875, "eval_rewards/rejected": 0.625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.5625, "eval_logps/chosen": -233.0, "eval_logps/rejected": -490.0, "eval_logits/chosen": -1.1484375, "eval_logits/rejected": 1.8125, "eval_nll_loss": 0.40429688, "epoch": 2.10526316, "global_step/max_steps": "80/114", "percentage": "70.18%", "elapsed_time": "4m 32s", "remaining_time": "1m 55s"} +{"loss": 0.34420776, "grad_norm": 0.83552608, "learning_rate": 1.676e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.291482, "rewards/chosen": 16.0625, "rewards/rejected": 1.1875, "rewards/accuracies": 1.0, "rewards/margins": 14.85000038, "logps/chosen": -249.6000061, "logps/rejected": -412.3999939, "logits/chosen": -0.32705078, "logits/rejected": 1.42734373, "nll_loss": 
0.34394532, "epoch": 2.23684211, "global_step/max_steps": "85/114", "percentage": "74.56%", "elapsed_time": "4m 48s", "remaining_time": "1m 38s"} +{"loss": 0.32097778, "grad_norm": 0.30799154, "learning_rate": 1.17e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.293824, "rewards/chosen": 16.20000076, "rewards/rejected": 1.39882815, "rewards/accuracies": 1.0, "rewards/margins": 14.76249981, "logps/chosen": -251.80000305, "logps/rejected": -320.0, "logits/chosen": -0.44003907, "logits/rejected": 0.81835938, "nll_loss": 0.32109374, "epoch": 2.36842105, "global_step/max_steps": "90/114", "percentage": "78.95%", "elapsed_time": "5m 3s", "remaining_time": "1m 20s"} +{"loss": 0.41552734, "grad_norm": 0.25398681, "learning_rate": 7.44e-06, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.293791, "rewards/chosen": 17.29999924, "rewards/rejected": 0.38164061, "rewards/accuracies": 1.0, "rewards/margins": 16.95000076, "logps/chosen": -312.20001221, "logps/rejected": -399.6000061, "logits/chosen": -0.246875, "logits/rejected": 1.1484375, "nll_loss": 0.41523439, "epoch": 2.5, "global_step/max_steps": "95/114", "percentage": "83.33%", "elapsed_time": "5m 20s", "remaining_time": "1m 4s"} +{"loss": 0.3494751, "grad_norm": 0.45383845, "learning_rate": 4.09e-06, "memory(GiB)": 49.45, "train_speed(iter/s)": 0.295106, "rewards/chosen": 16.52499962, "rewards/rejected": 0.69160157, "rewards/accuracies": 1.0, "rewards/margins": 15.83749962, "logps/chosen": -243.3999939, "logps/rejected": -329.6000061, "logits/chosen": -0.55219728, "logits/rejected": 1.42812502, "nll_loss": 0.34902343, "epoch": 2.63157895, "global_step/max_steps": "100/114", "percentage": "87.72%", "elapsed_time": "5m 36s", "remaining_time": "47s"} +{"eval_loss": 0.40234375, "eval_runtime": 1.3229, "eval_samples_per_second": 3.024, "eval_steps_per_second": 0.756, "eval_rewards/chosen": 15.375, "eval_rewards/rejected": 0.30078125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.125, "eval_logps/chosen": -231.0, 
"eval_logps/rejected": -492.0, "eval_logits/chosen": -1.109375, "eval_logits/rejected": 1.8125, "eval_nll_loss": 0.40234375, "epoch": 2.63157895, "global_step/max_steps": "100/114", "percentage": "87.72%", "elapsed_time": "5m 37s", "remaining_time": "47s"} +{"loss": 0.3121521, "grad_norm": 0.79372006, "learning_rate": 1.7e-06, "memory(GiB)": 49.45, "train_speed(iter/s)": 0.293988, "rewards/chosen": 16.125, "rewards/rejected": 1.4679687, "rewards/accuracies": 1.0, "rewards/margins": 14.67500019, "logps/chosen": -223.80000305, "logps/rejected": -309.0, "logits/chosen": -0.35712892, "logits/rejected": 1.04921877, "nll_loss": 0.31210938, "epoch": 2.76315789, "global_step/max_steps": "105/114", "percentage": "92.11%", "elapsed_time": "5m 54s", "remaining_time": "30s"} +{"loss": 0.30657806, "grad_norm": 0.44121696, "learning_rate": 3.4e-07, "memory(GiB)": 49.45, "train_speed(iter/s)": 0.294539, "rewards/chosen": 16.125, "rewards/rejected": 1.65312505, "rewards/accuracies": 1.0, "rewards/margins": 14.48750019, "logps/chosen": -209.19999695, "logps/rejected": -315.0, "logits/chosen": -0.43496093, "logits/rejected": 1.49843752, "nll_loss": 0.30664062, "epoch": 2.89473684, "global_step/max_steps": "110/114", "percentage": "96.49%", "elapsed_time": "6m 10s", "remaining_time": "13s"} +{"eval_loss": 0.40185547, "eval_runtime": 1.2824, "eval_samples_per_second": 3.119, "eval_steps_per_second": 0.78, "eval_rewards/chosen": 15.375, "eval_rewards/rejected": 0.30078125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.125, "eval_logps/chosen": -231.0, "eval_logps/rejected": -492.0, "eval_logits/chosen": -1.1171875, "eval_logits/rejected": 1.8125, "eval_nll_loss": 0.40234375, "epoch": 3.0, "global_step/max_steps": "114/114", "percentage": "100.00%", "elapsed_time": "6m 26s", "remaining_time": "0s"} +{"train_runtime": 387.4342, "train_samples_per_second": 2.307, "train_steps_per_second": 0.294, "total_flos": 144718705983488.0, "train_loss": 0.55275492, "epoch": 3.0, 
"global_step/max_steps": "114/114", "percentage": "100.00%", "elapsed_time": "6m 27s", "remaining_time": "0s"} +{"train_dataset": "1695.382550±899.293489, min=182.000000, max=4081.000000, size=298", "val_dataset": "1637.250000±797.581461, min=755.000000, max=2485.000000, size=4", "model_parameter_info": "PeftModelForCausalLM: 7635.8016M Params (20.1851M Trainable [0.2643%]), 0.0001M Buffers.", "last_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114", "best_model_checkpoint": "/m2v_intern/wangruotong/logs/output_deepseek_dpo/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/checkpoint-114", "best_metric": 0.40185547, "global_step": 114, "log_history": [{"loss": 1.12451171875, "grad_norm": 9.782230723528885, "learning_rate": 1.6666666666666667e-05, "memory(GiB)": 7.01, "train_speed(iter/s)": 0.124832, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/chosen": -440.0, "logps/rejected": -201.0, "logits/chosen": 0.8203125, "logits/rejected": 0.0322265625, "nll_loss": 0.43359375, "epoch": 0.02631578947368421, "step": 1}, {"loss": 1.8389892578125, "grad_norm": 11.149383195479057, "learning_rate": 8.333333333333334e-05, "memory(GiB)": 16.76, "train_speed(iter/s)": 0.24407, "rewards/chosen": 0.0422515869140625, "rewards/rejected": -0.012115478515625, "rewards/accuracies": 0.34375, "rewards/margins": 0.0543365478515625, "logps/chosen": -374.5, "logps/rejected": -310.25, "logits/chosen": -0.10595703125, "logits/rejected": -0.0537109375, "nll_loss": 1.171875, "epoch": 0.13157894736842105, "step": 5}, {"loss": 1.98125, "grad_norm": 10.151375928472369, "learning_rate": 9.966191788709716e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.260504, "rewards/chosen": 1.337499976158142, "rewards/rejected": 0.7310546636581421, "rewards/accuracies": 0.7749999761581421, "rewards/margins": 0.608593761920929, 
"logps/chosen": -454.79998779296875, "logps/rejected": -430.20001220703125, "logits/chosen": 0.11972656100988388, "logits/rejected": 0.15791015326976776, "nll_loss": 1.4812500476837158, "epoch": 0.2631578947368421, "step": 10}, {"loss": 0.8549072265625, "grad_norm": 4.153194634947897, "learning_rate": 9.829629131445342e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.28201, "rewards/chosen": 5.53125, "rewards/rejected": 2.6078124046325684, "rewards/accuracies": 0.875, "rewards/margins": 2.924999952316284, "logps/chosen": -330.0, "logps/rejected": -321.0, "logits/chosen": -0.266357421875, "logits/rejected": 0.06972656399011612, "nll_loss": 0.631640613079071, "epoch": 0.39473684210526316, "step": 15}, {"loss": 0.882763671875, "grad_norm": 4.3476489647148195, "learning_rate": 9.591080534401371e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.287161, "rewards/chosen": 8.8125, "rewards/rejected": 5.199999809265137, "rewards/accuracies": 0.9750000238418579, "rewards/margins": 3.628124952316284, "logps/chosen": -237.3000030517578, "logps/rejected": -317.0, "logits/chosen": -0.601757824420929, "logits/rejected": 0.5078125, "nll_loss": 0.7523437738418579, "epoch": 0.5263157894736842, "step": 20}, {"eval_loss": 0.53515625, "eval_runtime": 1.3114, "eval_samples_per_second": 3.05, "eval_steps_per_second": 0.763, "eval_rewards/chosen": 10.3125, "eval_rewards/rejected": 5.53125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 4.8125, "eval_logps/chosen": -282.0, "eval_logps/rejected": -440.0, "eval_logits/chosen": -2.203125, "eval_logits/rejected": 1.0546875, "eval_nll_loss": 0.5234375, "epoch": 0.5263157894736842, "step": 20}, {"loss": 0.598291015625, "grad_norm": 5.9296423681187935, "learning_rate": 9.255583362184999e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.282956, "rewards/chosen": 10.375, "rewards/rejected": 5.631249904632568, "rewards/accuracies": 1.0, "rewards/margins": 4.737500190734863, "logps/chosen": -358.79998779296875, "logps/rejected": 
-321.79998779296875, "logits/chosen": -0.19545897841453552, "logits/rejected": 0.32050782442092896, "nll_loss": 0.5562499761581421, "epoch": 0.6578947368421053, "step": 25}, {"loss": 0.4865478515625, "grad_norm": 0.8447788180902983, "learning_rate": 8.83022221559489e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.284899, "rewards/chosen": 11.324999809265137, "rewards/rejected": 4.412499904632568, "rewards/accuracies": 1.0, "rewards/margins": 6.90625, "logps/chosen": -244.3000030517578, "logps/rejected": -307.79998779296875, "logits/chosen": -0.353515625, "logits/rejected": 0.797656238079071, "nll_loss": 0.46484375, "epoch": 0.7894736842105263, "step": 30}, {"loss": 0.4839599609375, "grad_norm": 1.0119251792419137, "learning_rate": 8.323979328069689e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.286585, "rewards/chosen": 12.350000381469727, "rewards/rejected": 3.3062500953674316, "rewards/accuracies": 1.0, "rewards/margins": 9.037500381469727, "logps/chosen": -372.79998779296875, "logps/rejected": -333.0, "logits/chosen": -0.39433592557907104, "logits/rejected": 0.728515625, "nll_loss": 0.4814453125, "epoch": 0.9210526315789473, "step": 35}, {"loss": 0.44349365234375, "grad_norm": 1.0093735109752326, "learning_rate": 7.74754489035403e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.288367, "rewards/chosen": 12.287500381469727, "rewards/rejected": 1.6574218273162842, "rewards/accuracies": 1.0, "rewards/margins": 10.618749618530273, "logps/chosen": -264.79998779296875, "logps/rejected": -355.20001220703125, "logits/chosen": -0.12558594346046448, "logits/rejected": 0.776171863079071, "nll_loss": 0.55859375, "epoch": 1.0526315789473684, "step": 40}, {"eval_loss": 0.4453125, "eval_runtime": 1.3301, "eval_samples_per_second": 3.007, "eval_steps_per_second": 0.752, "eval_rewards/chosen": 13.3125, "eval_rewards/rejected": -0.07421875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 13.375, "eval_logps/chosen": -251.0, "eval_logps/rejected": -496.0, 
"eval_logits/chosen": -1.453125, "eval_logits/rejected": 1.6875, "eval_nll_loss": 0.4453125, "epoch": 1.0526315789473684, "step": 40}, {"loss": 0.44085693359375, "grad_norm": 0.7072609823649598, "learning_rate": 7.113091308703498e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.284733, "rewards/chosen": 14.087499618530273, "rewards/rejected": 0.51171875, "rewards/accuracies": 1.0, "rewards/margins": 13.5625, "logps/chosen": -304.20001220703125, "logps/rejected": -368.3999938964844, "logits/chosen": -0.19873046875, "logits/rejected": 1.137304663658142, "nll_loss": 0.4410156309604645, "epoch": 1.1842105263157894, "step": 45}, {"loss": 0.426397705078125, "grad_norm": 0.5247521627933828, "learning_rate": 6.434016163555452e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.286939, "rewards/chosen": 14.949999809265137, "rewards/rejected": 1.167578101158142, "rewards/accuracies": 1.0, "rewards/margins": 13.774999618530273, "logps/chosen": -297.3999938964844, "logps/rejected": -401.6000061035156, "logits/chosen": 0.0985107421875, "logits/rejected": 1.3796875476837158, "nll_loss": 0.4263671934604645, "epoch": 1.3157894736842106, "step": 50}, {"loss": 0.4094970703125, "grad_norm": 0.27152778325772464, "learning_rate": 5.724659296536233e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.29118, "rewards/chosen": 14.487500190734863, "rewards/rejected": 0.719531238079071, "rewards/accuracies": 1.0, "rewards/margins": 13.762499809265137, "logps/chosen": -258.0, "logps/rejected": -356.3999938964844, "logits/chosen": -0.18417969346046448, "logits/rejected": 1.4109375476837158, "nll_loss": 0.40937501192092896, "epoch": 1.4473684210526316, "step": 55}, {"loss": 0.27684326171875, "grad_norm": 0.23345574148400033, "learning_rate": 5e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.293391, "rewards/chosen": 15.274999618530273, "rewards/rejected": 1.7109375, "rewards/accuracies": 1.0, "rewards/margins": 13.574999809265137, "logps/chosen": -260.79998779296875, "logps/rejected": 
-298.6000061035156, "logits/chosen": -0.095703125, "logits/rejected": 1.083593726158142, "nll_loss": 0.2767578065395355, "epoch": 1.5789473684210527, "step": 60}, {"eval_loss": 0.421142578125, "eval_runtime": 1.3288, "eval_samples_per_second": 3.01, "eval_steps_per_second": 0.753, "eval_rewards/chosen": 14.625, "eval_rewards/rejected": 0.10107421875, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.5, "eval_logps/chosen": -239.0, "eval_logps/rejected": -494.0, "eval_logits/chosen": -1.15625, "eval_logits/rejected": 1.859375, "eval_nll_loss": 0.421875, "epoch": 1.5789473684210527, "step": 60}, {"loss": 0.3337158203125, "grad_norm": 0.6365766166484632, "learning_rate": 4.275340703463767e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.292443, "rewards/chosen": 15.637499809265137, "rewards/rejected": 1.1103515625, "rewards/accuracies": 1.0, "rewards/margins": 14.550000190734863, "logps/chosen": -259.79998779296875, "logps/rejected": -357.20001220703125, "logits/chosen": -0.505664050579071, "logits/rejected": 1.177148461341858, "nll_loss": 0.33378905057907104, "epoch": 1.7105263157894737, "step": 65}, {"loss": 0.3323974609375, "grad_norm": 0.5652403639285394, "learning_rate": 3.5659838364445505e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.292808, "rewards/chosen": 14.4375, "rewards/rejected": 1.1160156726837158, "rewards/accuracies": 1.0, "rewards/margins": 13.324999809265137, "logps/chosen": -191.60000610351562, "logps/rejected": -327.3999938964844, "logits/chosen": -0.01230468787252903, "logits/rejected": 1.0339844226837158, "nll_loss": 0.3324218690395355, "epoch": 1.8421052631578947, "step": 70}, {"loss": 0.39881591796875, "grad_norm": 0.9034319408629943, "learning_rate": 2.886908691296504e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.293937, "rewards/chosen": 15.975000381469727, "rewards/rejected": 0.05000000074505806, "rewards/accuracies": 1.0, "rewards/margins": 15.925000190734863, "logps/chosen": -294.20001220703125, "logps/rejected": 
-371.20001220703125, "logits/chosen": -0.39921873807907104, "logits/rejected": 1.267187476158142, "nll_loss": 0.3984375, "epoch": 1.973684210526316, "step": 75}, {"loss": 0.356890869140625, "grad_norm": 0.6395555179201285, "learning_rate": 2.25245510964597e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.292172, "rewards/chosen": 15.337499618530273, "rewards/rejected": 0.32695311307907104, "rewards/accuracies": 1.0, "rewards/margins": 15.037500381469727, "logps/chosen": -236.60000610351562, "logps/rejected": -388.0, "logits/chosen": -0.22714844346046448, "logits/rejected": 1.212499976158142, "nll_loss": 0.3724609315395355, "epoch": 2.1052631578947367, "step": 80}, {"eval_loss": 0.404296875, "eval_runtime": 1.3795, "eval_samples_per_second": 2.9, "eval_steps_per_second": 0.725, "eval_rewards/chosen": 15.1875, "eval_rewards/rejected": 0.625, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 14.5625, "eval_logps/chosen": -233.0, "eval_logps/rejected": -490.0, "eval_logits/chosen": -1.1484375, "eval_logits/rejected": 1.8125, "eval_nll_loss": 0.404296875, "epoch": 2.1052631578947367, "step": 80}, {"loss": 0.344207763671875, "grad_norm": 0.8355260792737639, "learning_rate": 1.6760206719303105e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.291482, "rewards/chosen": 16.0625, "rewards/rejected": 1.1875, "rewards/accuracies": 1.0, "rewards/margins": 14.850000381469727, "logps/chosen": -249.60000610351562, "logps/rejected": -412.3999938964844, "logits/chosen": -0.3270507752895355, "logits/rejected": 1.427343726158142, "nll_loss": 0.34394532442092896, "epoch": 2.236842105263158, "step": 85}, {"loss": 0.320977783203125, "grad_norm": 0.30799153697843773, "learning_rate": 1.1697777844051105e-05, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.293824, "rewards/chosen": 16.200000762939453, "rewards/rejected": 1.398828148841858, "rewards/accuracies": 1.0, "rewards/margins": 14.762499809265137, "logps/chosen": -251.8000030517578, "logps/rejected": -320.0, "logits/chosen": 
-0.4400390684604645, "logits/rejected": 0.818359375, "nll_loss": 0.32109373807907104, "epoch": 2.3684210526315788, "step": 90}, {"loss": 0.41552734375, "grad_norm": 0.2539868135825997, "learning_rate": 7.444166378150013e-06, "memory(GiB)": 40.2, "train_speed(iter/s)": 0.293791, "rewards/chosen": 17.299999237060547, "rewards/rejected": 0.38164061307907104, "rewards/accuracies": 1.0, "rewards/margins": 16.950000762939453, "logps/chosen": -312.20001220703125, "logps/rejected": -399.6000061035156, "logits/chosen": -0.24687500298023224, "logits/rejected": 1.1484375, "nll_loss": 0.41523438692092896, "epoch": 2.5, "step": 95}, {"loss": 0.34947509765625, "grad_norm": 0.4538384538590968, "learning_rate": 4.089194655986306e-06, "memory(GiB)": 49.45, "train_speed(iter/s)": 0.295106, "rewards/chosen": 16.524999618530273, "rewards/rejected": 0.691601574420929, "rewards/accuracies": 1.0, "rewards/margins": 15.837499618530273, "logps/chosen": -243.39999389648438, "logps/rejected": -329.6000061035156, "logits/chosen": -0.552197277545929, "logits/rejected": 1.428125023841858, "nll_loss": 0.3490234315395355, "epoch": 2.6315789473684212, "step": 100}, {"eval_loss": 0.40234375, "eval_runtime": 1.3229, "eval_samples_per_second": 3.024, "eval_steps_per_second": 0.756, "eval_rewards/chosen": 15.375, "eval_rewards/rejected": 0.30078125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.125, "eval_logps/chosen": -231.0, "eval_logps/rejected": -492.0, "eval_logits/chosen": -1.109375, "eval_logits/rejected": 1.8125, "eval_nll_loss": 0.40234375, "epoch": 2.6315789473684212, "step": 100}, {"loss": 0.312152099609375, "grad_norm": 0.793720059226471, "learning_rate": 1.70370868554659e-06, "memory(GiB)": 49.45, "train_speed(iter/s)": 0.293988, "rewards/chosen": 16.125, "rewards/rejected": 1.4679687023162842, "rewards/accuracies": 1.0, "rewards/margins": 14.675000190734863, "logps/chosen": -223.8000030517578, "logps/rejected": -309.0, "logits/chosen": -0.35712891817092896, 
"logits/rejected": 1.049218773841858, "nll_loss": 0.3121093809604645, "epoch": 2.763157894736842, "step": 105}, {"loss": 0.30657806396484377, "grad_norm": 0.44121695697744323, "learning_rate": 3.380821129028489e-07, "memory(GiB)": 49.45, "train_speed(iter/s)": 0.294539, "rewards/chosen": 16.125, "rewards/rejected": 1.6531250476837158, "rewards/accuracies": 1.0, "rewards/margins": 14.487500190734863, "logps/chosen": -209.1999969482422, "logps/rejected": -315.0, "logits/chosen": -0.4349609315395355, "logits/rejected": 1.498437523841858, "nll_loss": 0.306640625, "epoch": 2.8947368421052633, "step": 110}, {"eval_loss": 0.40185546875, "eval_runtime": 1.2824, "eval_samples_per_second": 3.119, "eval_steps_per_second": 0.78, "eval_rewards/chosen": 15.375, "eval_rewards/rejected": 0.30078125, "eval_rewards/accuracies": 1.0, "eval_rewards/margins": 15.125, "eval_logps/chosen": -231.0, "eval_logps/rejected": -492.0, "eval_logits/chosen": -1.1171875, "eval_logits/rejected": 1.8125, "eval_nll_loss": 0.40234375, "epoch": 3.0, "step": 114}, {"train_runtime": 387.4342, "train_samples_per_second": 2.307, "train_steps_per_second": 0.294, "total_flos": 144718705983488.0, "train_loss": 0.5527549208256236, "epoch": 3.0, "step": 114}], "memory": 49.44921875} diff --git a/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs/events.out.tfevents.1739308100.kml-task-540432-record-10109969-prod-worker-0.29445.0 b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs/events.out.tfevents.1739308100.kml-task-540432-record-10109969-prod-worker-0.29445.0 new file mode 100644 index 0000000000000000000000000000000000000000..16893c739e0fa02e242df7eacdad7545ca2c737e --- /dev/null +++ b/deepseek-r1-7b_400_0.5_dpo_4096_rank8_epoch3_what/v0-20250211-210626/runs/events.out.tfevents.1739308100.kml-task-540432-record-10109969-prod-worker-0.29445.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8d40a0f4323ae256ad02ffba7fc43495eaed51651bf88f936f5826794f63a5df +size 32205