diff --git a/checkpoints/002500/pretrained_model/config.json b/checkpoints/002500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/002500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/002500/pretrained_model/model.safetensors b/checkpoints/002500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..72c22b4402573dd9d826c1c9d3ad73928ca7e257 --- /dev/null +++ b/checkpoints/002500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60e28f536bd73b409492e722fbf68c4b16f7fdc72b5f299affc789fc6c9a36f +size 906712520 diff --git a/checkpoints/002500/pretrained_model/policy_postprocessor.json b/checkpoints/002500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/002500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/002500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/002500/pretrained_model/policy_preprocessor.json b/checkpoints/002500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/002500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/002500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/002500/pretrained_model/train_config.json b/checkpoints/002500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/002500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/002500/training_state/optimizer_param_groups.json b/checkpoints/002500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..4a8eb6995b136451c178c1d02a60c63986f7c065 --- /dev/null +++ b/checkpoints/002500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 9.833888403159208e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/002500/training_state/optimizer_state.safetensors b/checkpoints/002500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..304844bd3011101a601700af24f9b5ab972f852d --- /dev/null +++ b/checkpoints/002500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e758697c30b4f5b04c14077fc4e7c15625bea8c569513b0cc3267911e804369c +size 412659164 diff --git a/checkpoints/002500/training_state/rng_state.safetensors b/checkpoints/002500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e419d9c86e5b3c371e9cfc686dfcfd1b08327604 --- /dev/null +++ b/checkpoints/002500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aedb72c5a7a7b8c24b363a5b8ba851c9415352f25a519d3835a7955773f0abe +size 15708 diff --git a/checkpoints/002500/training_state/scheduler_state.json b/checkpoints/002500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2243c269694e95eb290978786e4c9526df1b7a6a --- /dev/null +++ b/checkpoints/002500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 2500, + "_step_count": 2501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 9.833888403159208e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/002500/training_state/training_step.json b/checkpoints/002500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..d265f47a09c2910099ed59e197b57b34675d1ae0 --- /dev/null +++ b/checkpoints/002500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 2500 +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/config.json b/checkpoints/005000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/005000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/model.safetensors b/checkpoints/005000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..519aba1f4d22295039966736801f13cbdb78cd48 --- /dev/null +++ b/checkpoints/005000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97705246d93d6067232e5150d74ec85b8e7d50cbba9c1116c08943d24d4132f6 +size 906712520 diff --git a/checkpoints/005000/pretrained_model/policy_postprocessor.json b/checkpoints/005000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/005000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/005000/pretrained_model/policy_preprocessor.json b/checkpoints/005000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/005000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/005000/pretrained_model/train_config.json b/checkpoints/005000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/005000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/005000/training_state/optimizer_param_groups.json b/checkpoints/005000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..da3ff6fe3c49bde113b5e51b8f07a79834a00a55 --- /dev/null +++ b/checkpoints/005000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 9.34687384344914e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/005000/training_state/optimizer_state.safetensors b/checkpoints/005000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8033d157e2567656cd34fa87f1377f208059dac7 --- /dev/null +++ b/checkpoints/005000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bad8e5efa15e58ede4c482aba544b0d0c4365a00824c75ed59cb2c7cc728a95 +size 412659164 diff --git a/checkpoints/005000/training_state/rng_state.safetensors b/checkpoints/005000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c1159fa649794f5c2f23c950b05fb79db3221490 --- /dev/null +++ b/checkpoints/005000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21a69e1619d83b18f9c2e4017f24d786b7a133260c05cf2b88f48efeeef048f9 +size 15708 diff --git a/checkpoints/005000/training_state/scheduler_state.json b/checkpoints/005000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..26e28372a197de2e43c4da4b379b2995d3c971c2 --- /dev/null +++ b/checkpoints/005000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 5000, + "_step_count": 5001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 9.34687384344914e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/005000/training_state/training_step.json b/checkpoints/005000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..592449d3fc8b35c30c604eb1dabe60537e8224a0 --- /dev/null +++ b/checkpoints/005000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 5000 +} \ No newline at end of file diff --git a/checkpoints/007500/pretrained_model/config.json b/checkpoints/007500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/007500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/007500/pretrained_model/model.safetensors b/checkpoints/007500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..afadae30295117f321f9ed632f28b91f61fe99e6 --- /dev/null +++ b/checkpoints/007500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81401516f5ff9f13776080e43c17467ea569da28305cebe1d0be81d9e50f1eae +size 906712520 diff --git a/checkpoints/007500/pretrained_model/policy_postprocessor.json b/checkpoints/007500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/007500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/007500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/007500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/007500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/007500/pretrained_model/policy_preprocessor.json b/checkpoints/007500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/007500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/007500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/007500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/007500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/007500/pretrained_model/train_config.json b/checkpoints/007500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/007500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/007500/training_state/optimizer_param_groups.json b/checkpoints/007500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..43575e068ea049c658a0470ace2668ea091c6723 --- /dev/null +++ b/checkpoints/007500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 8.572145558284418e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/007500/training_state/optimizer_state.safetensors b/checkpoints/007500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ba3c1fbe31146eb7854cc79098ba0bdb6ee29fe0 --- /dev/null +++ b/checkpoints/007500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1683d9a932b0e154020ae8589dccfec1f83296cc075603d38261effc968fef2 +size 412659164 diff --git a/checkpoints/007500/training_state/rng_state.safetensors b/checkpoints/007500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d7195b629bf6781abfc0dbd6618c54fa72703d2 --- /dev/null +++ b/checkpoints/007500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f57337ffebd188864d3991ce0e1278d5ecf1881ba2e3617b1f29bc27f93272f +size 15708 diff --git a/checkpoints/007500/training_state/scheduler_state.json b/checkpoints/007500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af1e2297bc2901f61499236a41155372547389a8 --- /dev/null +++ b/checkpoints/007500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 7500, + "_step_count": 7501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 8.572145558284418e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/007500/training_state/training_step.json b/checkpoints/007500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..119265efe00a465e1aea6ca140089eb7aa8795c0 --- /dev/null +++ b/checkpoints/007500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 7500 +} \ No newline at end of file diff --git a/checkpoints/010000/pretrained_model/config.json b/checkpoints/010000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/010000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/010000/pretrained_model/model.safetensors b/checkpoints/010000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..074a42179b944b0b7552fe17f81dfd925a45a682 --- /dev/null +++ b/checkpoints/010000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1ab731027826bbf19b686b862c78dd7aeefe88acb3788ee7c91c579c5464052 +size 906712520 diff --git a/checkpoints/010000/pretrained_model/policy_postprocessor.json b/checkpoints/010000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/010000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/010000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/010000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/010000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/010000/pretrained_model/policy_preprocessor.json b/checkpoints/010000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/010000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/010000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/010000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/010000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/010000/pretrained_model/train_config.json b/checkpoints/010000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/010000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/010000/training_state/optimizer_param_groups.json b/checkpoints/010000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..1ee4c92efd08e25593632b371807c89fc1b8ecc2 --- /dev/null +++ b/checkpoints/010000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 7.5625e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/010000/training_state/optimizer_state.safetensors b/checkpoints/010000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..91de7a49175a921c14005d16f563f7a2ae4d712a --- /dev/null +++ b/checkpoints/010000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09a22440f2b3498222fcf0fcf7eaec207496a6f7ee525b48b7c5acd846548e6a +size 412659164 diff --git a/checkpoints/010000/training_state/rng_state.safetensors b/checkpoints/010000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..665a6850657aee883b428330b0fcd1adcab39523 --- /dev/null +++ b/checkpoints/010000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:989dd33d1f046cb3f3f3119202c13079fb3f8793edfce9ec87af3b30d74c1bb7 +size 15708 diff --git a/checkpoints/010000/training_state/scheduler_state.json b/checkpoints/010000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c1a6fabdcd1c2a4515b7818e55be9009a5f7ef1e --- /dev/null +++ b/checkpoints/010000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 10000, + "_step_count": 10001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 7.5625e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/010000/training_state/training_step.json b/checkpoints/010000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..7cb7c0986e9e7461ca851ce71e95d235ae3d2732 --- /dev/null +++ b/checkpoints/010000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 10000 +} \ No newline at end of file diff --git a/checkpoints/012500/pretrained_model/config.json b/checkpoints/012500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/012500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/012500/pretrained_model/model.safetensors b/checkpoints/012500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..407c93f34dd79b42856edfe2095f75bd8e0c5af4 --- /dev/null +++ b/checkpoints/012500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6164d899be016df948b851134e880353d6c084a4e9e224fa64c8fad2c8bf792b +size 906712520 diff --git a/checkpoints/012500/pretrained_model/policy_postprocessor.json b/checkpoints/012500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/012500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/012500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/012500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/012500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/012500/pretrained_model/policy_preprocessor.json b/checkpoints/012500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/012500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/012500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/012500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/012500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/012500/pretrained_model/train_config.json b/checkpoints/012500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/012500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/012500/training_state/optimizer_param_groups.json b/checkpoints/012500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..812a28183920482c0f8faf10ff2c929a68cffa8b --- /dev/null +++ b/checkpoints/012500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 6.38674284487479e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/012500/training_state/optimizer_state.safetensors b/checkpoints/012500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6c88547cf39559618ccb8ece246dc2ab6a32be95 --- /dev/null +++ b/checkpoints/012500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fc0c17114c4871eef2dd403b5b0b13f57ffdb40c518bb788d86ed521e409891 +size 412659164 diff --git a/checkpoints/012500/training_state/rng_state.safetensors b/checkpoints/012500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ad2336910d767944e0a127bb045c56fbb709f34d --- /dev/null +++ b/checkpoints/012500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbb48bbb55d38a43095d9ae53813bc420e70e6c05cc3179da48560705815f8b8 +size 15708 diff --git a/checkpoints/012500/training_state/scheduler_state.json b/checkpoints/012500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0852d1dea3f53d37259a91d41d454bff2f58c4da --- /dev/null +++ b/checkpoints/012500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 12500, + "_step_count": 12501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 6.38674284487479e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/012500/training_state/training_step.json b/checkpoints/012500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..6d00d1fb30e9fa29506662caa05a99b86254c107 --- /dev/null +++ b/checkpoints/012500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 12500 +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/config.json b/checkpoints/015000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/015000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/model.safetensors b/checkpoints/015000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ee6962a10e3070ac8fe90e0a0df58f7866cc1293 --- /dev/null +++ b/checkpoints/015000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2b0865fb3218c33cf8160c86ebc7e40e1bacdd82a6a70eae9c0dc593c332fbf +size 906712520 diff --git a/checkpoints/015000/pretrained_model/policy_postprocessor.json b/checkpoints/015000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/015000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/015000/pretrained_model/policy_preprocessor.json b/checkpoints/015000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/015000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/015000/pretrained_model/train_config.json b/checkpoints/015000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/015000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/015000/training_state/optimizer_param_groups.json b/checkpoints/015000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..9277f6533d9b54979939a62759a6fbdec0e5ac6c --- /dev/null +++ b/checkpoints/015000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 5.125e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/015000/training_state/optimizer_state.safetensors b/checkpoints/015000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eae945348660da20c931f74e1357557316ecfb94 --- /dev/null +++ b/checkpoints/015000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38e166a5265d36ab5b1f86177a962f9afd5e8eefa1a29d9cd6a1116eb2b6514c +size 412659164 diff --git a/checkpoints/015000/training_state/rng_state.safetensors b/checkpoints/015000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec1523561fceaf07991e780e2bc5f21c15ba2ddd --- /dev/null +++ b/checkpoints/015000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:698a47edbb7addacb96b850f5a1a221bcacb2114ebdd34534947075c9649852c +size 15708 diff --git a/checkpoints/015000/training_state/scheduler_state.json b/checkpoints/015000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ba6b6927e93a4428e4458c8496e3590dcbbac247 --- /dev/null +++ b/checkpoints/015000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 15000, + "_step_count": 15001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 5.125e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/015000/training_state/training_step.json b/checkpoints/015000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..5cec056c8ba7f3c2e865a0f73ae59975a0503067 --- /dev/null +++ b/checkpoints/015000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 15000 +} \ No newline at end of file diff --git a/checkpoints/017500/pretrained_model/config.json b/checkpoints/017500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/017500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/017500/pretrained_model/model.safetensors b/checkpoints/017500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5547f9ea442f9e984da5ae87a8128ae5909338ec --- /dev/null +++ b/checkpoints/017500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25087bb3d685c0be5f325e2ee6840876d66029f590e4f793576c71e02c8954a9 +size 906712520 diff --git a/checkpoints/017500/pretrained_model/policy_postprocessor.json b/checkpoints/017500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/017500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/017500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/017500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/017500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/017500/pretrained_model/policy_preprocessor.json b/checkpoints/017500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/017500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/017500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/017500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/017500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/017500/pretrained_model/train_config.json b/checkpoints/017500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/017500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/017500/training_state/optimizer_param_groups.json b/checkpoints/017500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..1fc2fc45607e594353d8008c7e1d86c540c6101b --- /dev/null +++ b/checkpoints/017500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 3.863257155125212e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/017500/training_state/optimizer_state.safetensors b/checkpoints/017500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5d4388db6c2f7f808deee148f35d5779658cc18 --- /dev/null +++ b/checkpoints/017500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b9a3e0d8ab200aabebadc84fbd68a6c61997d7cbb8747d7c0cdc801d73c078e +size 412659164 diff --git a/checkpoints/017500/training_state/rng_state.safetensors b/checkpoints/017500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d178f6ef69a59d95ce310637a1803c1fc613b0e5 --- /dev/null +++ b/checkpoints/017500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:670fc22cdaa7250f175a35ec2b789f35a4394f4a4d57d92da16f08ebb0e42b36 +size 15708 diff --git a/checkpoints/017500/training_state/scheduler_state.json b/checkpoints/017500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4df9452e4c2557a375083bd9f206dea2f77ab676 --- /dev/null +++ b/checkpoints/017500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 17500, + "_step_count": 17501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 3.863257155125212e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/017500/training_state/training_step.json b/checkpoints/017500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..da8b274dab5ff8a957dfac52878d5b5a54a1a31d --- /dev/null +++ b/checkpoints/017500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 17500 +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/config.json b/checkpoints/020000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/020000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/model.safetensors b/checkpoints/020000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a693278b554d09f3a25f7ed6d5646a361709f2f0 --- /dev/null +++ b/checkpoints/020000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3c3fff43e2481ec97032d889b2ba1e9116caa9cca44323b5c745cde4b1e0be1 +size 906712520 diff --git a/checkpoints/020000/pretrained_model/policy_postprocessor.json b/checkpoints/020000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/020000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/020000/pretrained_model/policy_preprocessor.json b/checkpoints/020000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/020000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/020000/pretrained_model/train_config.json b/checkpoints/020000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/020000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/020000/training_state/optimizer_param_groups.json b/checkpoints/020000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..fd995dd00b618398663d24c98ad932310a8be4b7 --- /dev/null +++ b/checkpoints/020000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 2.6875000000000013e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/020000/training_state/optimizer_state.safetensors b/checkpoints/020000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f0cd1dd746dc87b2db19e91f2081842c7dbb8c59 --- /dev/null +++ b/checkpoints/020000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f5350bf4bc91d174a43d718718f450ae1c1f137cc83d10f124d82539951b50 +size 412659164 diff --git a/checkpoints/020000/training_state/rng_state.safetensors b/checkpoints/020000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3eb95b678bf096f8e0a3f8f9e5c7e7de9ea235de --- /dev/null +++ b/checkpoints/020000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3ac7c56a234e35a2e1e7b15e39c75c02a838cd8c49f55b1f6dde2545d52a97e +size 15708 diff --git a/checkpoints/020000/training_state/scheduler_state.json b/checkpoints/020000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bc8702f14e2b5c016ee125c7113babe553dd2e53 --- /dev/null +++ b/checkpoints/020000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 20000, + "_step_count": 20001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 2.6875000000000013e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/020000/training_state/training_step.json b/checkpoints/020000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..dc9bb47026c5d5237ca6fc5dbff6020dd122ea05 --- /dev/null +++ b/checkpoints/020000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 20000 +} \ No newline at end of file diff --git a/checkpoints/022500/pretrained_model/config.json b/checkpoints/022500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/022500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/022500/pretrained_model/model.safetensors b/checkpoints/022500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c215731871ffcab4103a49cc30d47b0c4ec5503e --- /dev/null +++ b/checkpoints/022500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d330eb6c2c820aa4932cf425b3b7b742e9d0abd33e981677f7a9c026d67ca21 +size 906712520 diff --git a/checkpoints/022500/pretrained_model/policy_postprocessor.json b/checkpoints/022500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/022500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/022500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/022500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/022500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/022500/pretrained_model/policy_preprocessor.json b/checkpoints/022500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/022500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/022500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/022500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/022500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/022500/pretrained_model/train_config.json b/checkpoints/022500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/022500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/022500/training_state/optimizer_param_groups.json b/checkpoints/022500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..ac647e3325da800c05c8d3cbe177c91d11973a5f --- /dev/null +++ b/checkpoints/022500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 1.677854441715581e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/022500/training_state/optimizer_state.safetensors b/checkpoints/022500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..94894ad34aa3463bf84f554883affd94f6f5f659 --- /dev/null +++ b/checkpoints/022500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:573b104e9ae035842423c1dc107c60ad876197612f55a5f661d6a4e8a3a66dd9 +size 412659164 diff --git a/checkpoints/022500/training_state/rng_state.safetensors b/checkpoints/022500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea70e636152e81b6d1a9b0d5b425524ddd9e0eb1 --- /dev/null +++ b/checkpoints/022500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46f6bcf586b76619eeb1781d00c9e09c527dffddb42e2388753729ee93d3db58 +size 15708 diff --git a/checkpoints/022500/training_state/scheduler_state.json b/checkpoints/022500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3fbe00aac481debedc95e57cd5d2a2f64a69c057 --- /dev/null +++ b/checkpoints/022500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 22500, + "_step_count": 22501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 1.677854441715581e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/022500/training_state/training_step.json b/checkpoints/022500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..f806cbfd68a16a907ec083e2c16fb225eb569102 --- /dev/null +++ b/checkpoints/022500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 22500 +} \ No newline at end of file diff --git a/checkpoints/025000/pretrained_model/config.json b/checkpoints/025000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/025000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/025000/pretrained_model/model.safetensors b/checkpoints/025000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a3a8b13941c1afd61ba7e6883e648e9b81db71dc --- /dev/null +++ b/checkpoints/025000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75ea7b4f287d2890000334aa188862ab4dcda87ca6918869d9fda7ca35a1e0bc +size 906712520 diff --git a/checkpoints/025000/pretrained_model/policy_postprocessor.json b/checkpoints/025000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/025000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/025000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/025000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/025000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/025000/pretrained_model/policy_preprocessor.json b/checkpoints/025000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/025000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/025000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/025000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/025000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/025000/pretrained_model/train_config.json b/checkpoints/025000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/025000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/025000/training_state/optimizer_param_groups.json b/checkpoints/025000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..e3ca9c8087502a565c9fab9b9cba038bf4a0fa88 --- /dev/null +++ b/checkpoints/025000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 9.031261565508626e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/025000/training_state/optimizer_state.safetensors b/checkpoints/025000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ab551993b20df86367fca5bbdc9bdb1d75668af --- /dev/null +++ b/checkpoints/025000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af6df827940d63276716277d58a251e5281126a78ab7d198ae0d57e50b4ddd73 +size 412659164 diff --git a/checkpoints/025000/training_state/rng_state.safetensors b/checkpoints/025000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..647ec84c1fc1e5f4cdee02981cbcefd7a5f44f18 --- /dev/null +++ b/checkpoints/025000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd650253ea7d55765bdfa20d9f112d4c14fc5cdbd93c6365948f631d5712e626 +size 15708 diff --git a/checkpoints/025000/training_state/scheduler_state.json b/checkpoints/025000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cbcf8cfe42f784efa28d35a6863ba0dc2c596011 --- /dev/null +++ b/checkpoints/025000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 25000, + "_step_count": 25001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 9.031261565508626e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/025000/training_state/training_step.json b/checkpoints/025000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..ac6903ae05e7bbaec1e2da710fdbe91d8dc3f37d --- /dev/null +++ b/checkpoints/025000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 25000 +} \ No newline at end of file diff --git a/checkpoints/027500/pretrained_model/config.json b/checkpoints/027500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/027500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/027500/pretrained_model/model.safetensors b/checkpoints/027500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f32a9d8f6b1c4059e38f547e7c240705ec975c5d --- /dev/null +++ b/checkpoints/027500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0b1aec400a5af1bb2a4b5cdd09d0e320a5102321f5080ea419d38d72455e77e +size 906712520 diff --git a/checkpoints/027500/pretrained_model/policy_postprocessor.json b/checkpoints/027500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/027500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/027500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/027500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/027500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/027500/pretrained_model/policy_preprocessor.json b/checkpoints/027500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/027500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/027500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/027500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/027500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/027500/pretrained_model/train_config.json b/checkpoints/027500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/027500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/027500/training_state/optimizer_param_groups.json b/checkpoints/027500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..d64750c4b9541926eff51c3d00b2e53cfa37c25f --- /dev/null +++ b/checkpoints/027500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 4.161115968407925e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/027500/training_state/optimizer_state.safetensors b/checkpoints/027500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b855a9dae2208ee85855a731f7e8f3ae479d9d0d --- /dev/null +++ b/checkpoints/027500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e9dff38601d8a3a5f9bee25b642f24c98057c55df25a67a6fffc641c592ff33 +size 412659164 diff --git a/checkpoints/027500/training_state/rng_state.safetensors b/checkpoints/027500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..739caa1c59be5f7e00f693aa959ff56630c042d3 --- /dev/null +++ b/checkpoints/027500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc44b6cd83260df27916b8def922322988d255bfbbd6e0d61c3291795f2e1d7e +size 15708 diff --git a/checkpoints/027500/training_state/scheduler_state.json b/checkpoints/027500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c081719f3552fd72683cc831570dc198e116d92e --- /dev/null +++ b/checkpoints/027500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 27500, + "_step_count": 27501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 4.161115968407925e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/027500/training_state/training_step.json b/checkpoints/027500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..bd4b7f79233b940b70aec36bd79954b47a0e826d --- /dev/null +++ b/checkpoints/027500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 27500 +} \ No newline at end of file diff --git a/checkpoints/030000/pretrained_model/config.json b/checkpoints/030000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87f94c4dd199506631911828947f1bb225c026d7 --- /dev/null +++ b/checkpoints/030000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/030000/pretrained_model/model.safetensors b/checkpoints/030000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..128532e25abd1edbed955edbfa84e298a271d6ad --- /dev/null +++ b/checkpoints/030000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:def7fd51ae688d923422840dc03b2d278f696964b5ef85d409c5ddd9c83777d4 +size 906712520 diff --git a/checkpoints/030000/pretrained_model/policy_postprocessor.json b/checkpoints/030000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/030000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/030000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/030000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/030000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/030000/pretrained_model/policy_preprocessor.json b/checkpoints/030000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/030000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/030000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/030000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92f1bab27857fb364f324d0aa0e5166b8c2b1faa --- /dev/null +++ b/checkpoints/030000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c809ea4a32d649fa36ef9f13494ef6ac739d7a0f095474b55d41748a9fd36bdf +size 5840 diff --git a/checkpoints/030000/pretrained_model/train_config.json b/checkpoints/030000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..516d786dd37d311d76bed34d6e3ffb93ab5b8142 --- /dev/null +++ b/checkpoints/030000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_90_chengming_promptaug_v1", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "job_name": "eval2_90_promptaug_v1_randprompt_smolvla_runA_30k_wandb", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 4, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "5i3050dl", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/030000/training_state/optimizer_param_groups.json b/checkpoints/030000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..432e25d9d9051a91f53e76a5753722c07eba5dab --- /dev/null +++ b/checkpoints/030000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 2.5e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/030000/training_state/optimizer_state.safetensors b/checkpoints/030000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f400462626edaf4c899ce847ae156b55397aa184 --- /dev/null +++ b/checkpoints/030000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c0dae8dae0902c974f9e79b58ec51a2a3cc3942e7a47d2c1b4fde86f9895d71 +size 412659164 diff --git a/checkpoints/030000/training_state/rng_state.safetensors b/checkpoints/030000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29edc5ab7306dbeb7924b2b78ab18bbe313abaac --- /dev/null +++ b/checkpoints/030000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70b7714a295ae4c342087ff70b48b6dbe488243e606c0637253c96662fb84e98 +size 15708 diff --git a/checkpoints/030000/training_state/scheduler_state.json b/checkpoints/030000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a09e9ada2b0bc0b2f519fcd89834a9e8725ec165 --- /dev/null +++ b/checkpoints/030000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 30000, + "_step_count": 30001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 2.5e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/030000/training_state/training_step.json b/checkpoints/030000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..f4945f660f45b332883dccfccf18d8b8815d916a --- /dev/null +++ b/checkpoints/030000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 30000 +} \ No newline at end of file