| dataset: s3://scale-ml/users/niklas/swe-agent/train/qwen-2000imitation-student/with_pr/masked/961dagger-2000imitation |
| debug: false |
| eval_at_step_zero: null |
| eval_case_report: false |
| eval_case_report_only_splits: [] |
| eval_mode: false |
| eval_steps: 0.2 |
| eval_strategy: steps |
| experimental: |
| activation_checkpointing: true |
| enable_context_parallel: false |
| mask_input_ids_by_flag: true |
| pad_to_max_length: false |
| pipeline_parallel_reshard: false |
| pipeline_parallel_schedule: gpipe |
| pipeline_parallel_size: 1 |
| pp_last_stage_offset: 0 |
| pp_share_train_eval_schedule: true |
| torch_compile: false |
| hyperparams: |
| adam_beta1: 0.9 |
| adam_beta2: 0.999 |
| adam_epsilon: 1.0e-08 |
| adam_weight_decay: null |
| constant_pack: false |
| eval_num_rollouts_per_prompt: 1 |
| gradient_accumulation_steps: 1 |
| learning_rate: 5.0e-05 |
| loss_form: null |
| lr_scheduler_kwargs: null |
| lr_scheduler_type: cosine |
| mask_instruct: true |
| max_grad_norm: 1.0 |
| max_length: 32768 |
| num_rollouts_per_prompt: 1 |
| num_train_epochs: 3 |
| num_train_steps: -1 |
| online: false |
| optimizer: adam |
| per_device_eval_batch_size: 1 |
| per_device_micro_batch_size: 1 |
| per_device_train_batch_size: 1 |
| sleep_level: 2 |
| warmup_ratio: 0.05 |
| weight_decay: 0.01 |
| local_output_path: /mnt/nvme |
| logging_rollouts: 0 |
| logging_steps: 1 |
| model_squad: |
| lm: |
| activation_checkpointing: true |
| model_path: s3://scale-ml/users/niklas/models/smith-claude-expert/2000imitation-lr5e-5-batch16/checkpoints/checkpoint-375/ |
| parallel_state: |
| cp_mesh: null |
| cp_size: 1 |
| device_type: cuda |
| dp_size: 16 |
| pp_size: 1 |
| world_mesh: !!python/object:torch.distributed.device_mesh.DeviceMesh |
| _coordinate_on_dim: |
| - 0 |
| _dim_group_infos: |
| - !!python/tuple |
| - ptd:0 |
| - - 0 |
| - 1 |
| - 2 |
| - 3 |
| - 4 |
| - 5 |
| - 6 |
| - 7 |
| - 8 |
| - 9 |
| - 10 |
| - 11 |
| - 12 |
| - 13 |
| - 14 |
| - 15 |
| - '0' |
| _flatten_mesh_list: !!python/tuple |
| - 0 |
| - 1 |
| - 2 |
| - 3 |
| - 4 |
| - 5 |
| - 6 |
| - 7 |
| - 8 |
| - 9 |
| - 10 |
| - 11 |
| - 12 |
| - 13 |
| - 14 |
| - 15 |
| _hash: -8305722318908533129 |
| _thread_id: null |
| device_type: cuda |
| mesh: !!python/object/apply:torch._utils._rebuild_tensor_v2 |
| - !!python/object/apply:torch.storage._load_from_bytes |
| - !!binary | |
| gAKKCmz8nEb5IGqoUBkugAJN6QMugAJ9cQAoWBAAAABwcm90b2NvbF92ZXJzaW9ucQFN6QNYDQAA |
| AGxpdHRsZV9lbmRpYW5xAohYCgAAAHR5cGVfc2l6ZXNxA31xBChYBQAAAHNob3J0cQVLAlgDAAAA |
| aW50cQZLBFgEAAAAbG9uZ3EHSwR1dS6AAihYBwAAAHN0b3JhZ2VxAGN0b3JjaApJbnRTdG9yYWdl |
| CnEBWA8AAAAxMDI1MzIxMjY3NjY4MDBxAlgDAAAAY3B1cQNLEE50cQRRLoACXXEAWA8AAAAxMDI1 |
| MzIxMjY3NjY4MDBxAWEuEAAAAAAAAAAAAAAAAQAAAAIAAAADAAAABAAAAAUAAAAGAAAABwAAAAgA |
| AAAJAAAACgAAAAsAAAAMAAAADQAAAA4AAAAPAAAA |
| - 0 |
| - !!python/tuple |
| - 16 |
| - !!python/tuple |
| - 1 |
| - false |
| - !!python/object/apply:collections.OrderedDict |
| - [] |
| mesh_dim_names: !!python/tuple |
| - dp |
| world_size: 16 |
| torch_compile: false |
| use_fsdp2: true |
| use_scale_llama: false |
| processing_interface: |
| class_name: BaseProcessingInterface |
| module_path: trainers.processing_interface |
| remote_object: {} |
| resume: false |
| s3_output_path: s3://scale-ml/users/niklas/models/qwen-2000imitation-student/with_pr/masked/961dagger-2000imitation |
| save_at_step_zero: null |
| save_final_model: true |
| save_hf: true |
| save_lr_scheduler: true |
| save_optimizer: true |
| save_s3_async: true |
| save_steps: 0.2 |
| save_strategy: epoch |
| task: sft |
| use_device_mesh: true |
| use_fsdp2: true |
| use_scale_llama: false |
| wandb: |
| entity: gen-ai |
| name: qwen2.5-961dagger-2000imitation-with_pr-masked-lr5e-5-batch16 |
| project: agent-rlxf |
| wandb_host: https://scaleai.wandb.io/ |
| wandb_key_name: NIKLAS_WANDB_API_KEY |
| wandb_secretsmanager_location: team/GENAIML/secret-store-key |
|
|