diff --git a/checkpoints/000500/pretrained_model/config.json b/checkpoints/000500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/000500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/000500/pretrained_model/model.safetensors b/checkpoints/000500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5863e3340c9365d6c31f5b2c0033d5741ba21b9c --- /dev/null +++ b/checkpoints/000500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5368a26e225dcbf8f81c0f8e1ba9be8e96517c82cd6d2a149a8143a8dc3a5e58 +size 906712520 diff --git a/checkpoints/000500/pretrained_model/policy_postprocessor.json b/checkpoints/000500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/000500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/000500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/000500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/000500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/000500/pretrained_model/policy_preprocessor.json b/checkpoints/000500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/000500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/000500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/000500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/000500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/000500/pretrained_model/train_config.json b/checkpoints/000500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/000500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/000500/training_state/optimizer_param_groups.json b/checkpoints/000500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..60f9dee48a8f25874671e1d728e6afa9a406218c --- /dev/null +++ b/checkpoints/000500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 1.970185098002935e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/000500/training_state/optimizer_state.safetensors b/checkpoints/000500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..56c0a2977a9db16b23e4f16e2c34679a273b8985 --- /dev/null +++ b/checkpoints/000500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c0206d802e426e478a6e7b72e548494033928fc4bd017757b5594fb46bf8a54 +size 412659164 diff --git a/checkpoints/000500/training_state/rng_state.safetensors b/checkpoints/000500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..89135c3aaad4664782a56b577827e71ac59bde04 --- /dev/null +++ b/checkpoints/000500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:760905c6f38fd9f3e1de5a10822292fcb05ae9cf1dfe6df5b4619fc532e5911b +size 15708 diff --git a/checkpoints/000500/training_state/scheduler_state.json b/checkpoints/000500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6a1e681b8b7b23a95067ba609dbc3e9c9dd7fe19 --- /dev/null +++ b/checkpoints/000500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 500, + "_step_count": 501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 1.970185098002935e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/000500/training_state/training_step.json b/checkpoints/000500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..8df625539d7e10ddfbd1e2ac0daf2fdd64ff5c4b --- /dev/null +++ b/checkpoints/000500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 500 +} \ No newline at end of file diff --git a/checkpoints/001000/pretrained_model/config.json b/checkpoints/001000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/001000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/001000/pretrained_model/model.safetensors b/checkpoints/001000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..813cd9c590bf93f0959196671322cd43991277e8 --- /dev/null +++ b/checkpoints/001000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6237b015ad8db950b86fef7b08e7c1c454cd470e759f7ab8ab9c43c61097e4dd +size 906712520 diff --git a/checkpoints/001000/pretrained_model/policy_postprocessor.json b/checkpoints/001000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/001000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/001000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/001000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/001000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/001000/pretrained_model/policy_preprocessor.json b/checkpoints/001000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/001000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/001000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/001000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/001000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/001000/pretrained_model/train_config.json b/checkpoints/001000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/001000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/001000/training_state/optimizer_param_groups.json b/checkpoints/001000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..d85c9724ce3e16e2b8ce774699945585b569f4a8 --- /dev/null +++ b/checkpoints/001000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 1.882772228311384e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/001000/training_state/optimizer_state.safetensors b/checkpoints/001000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..319d3a9f41df7e249c34e44ef83d9cffe0b868f6 --- /dev/null +++ b/checkpoints/001000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e24af6e6a1882f62bdf78814a346158350105f2c58a0812cd6382587cc50fff2 +size 412659164 diff --git a/checkpoints/001000/training_state/rng_state.safetensors b/checkpoints/001000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eb75e270b25b19402b7ead004a6af5e7aff63d07 --- /dev/null +++ b/checkpoints/001000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84f4c1dc1340cf93f4bd71b873fcc867eb5f113760e70346ce74179c94f3d3cb +size 15708 diff --git a/checkpoints/001000/training_state/scheduler_state.json b/checkpoints/001000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eacc9f6fe8644072798e70dfceeb328e96e2f234 --- /dev/null +++ b/checkpoints/001000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 1000, + "_step_count": 1001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 1.882772228311384e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/001000/training_state/training_step.json b/checkpoints/001000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..d98c94f5b78238bf495ac68b9f9fb446cfac5c07 --- /dev/null +++ b/checkpoints/001000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 1000 +} \ No newline at end of file diff --git a/checkpoints/001500/pretrained_model/config.json b/checkpoints/001500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/001500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/001500/pretrained_model/model.safetensors b/checkpoints/001500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6b4524e5c8ab37b7ed30fe0639d481bc66f2c5c9 --- /dev/null +++ b/checkpoints/001500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0086a706d81607edc657a6db31f2c2edfb6e30c8dca7b44ad54034dd33156759 +size 906712520 diff --git a/checkpoints/001500/pretrained_model/policy_postprocessor.json b/checkpoints/001500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/001500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/001500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/001500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/001500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/001500/pretrained_model/policy_preprocessor.json b/checkpoints/001500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/001500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/001500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/001500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/001500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/001500/pretrained_model/train_config.json b/checkpoints/001500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/001500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/001500/training_state/optimizer_param_groups.json b/checkpoints/001500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..6e2d301e48188a8826ec3423e952d759b320b54d --- /dev/null +++ b/checkpoints/001500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 1.743718433538229e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/001500/training_state/optimizer_state.safetensors b/checkpoints/001500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..814f1b6790f06d77ea8c543272804764f159f466 --- /dev/null +++ b/checkpoints/001500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:822bee79d6e7b38e305f8bb83981d3051c727dbbecf7b8989a42adacf502468f +size 412659164 diff --git a/checkpoints/001500/training_state/rng_state.safetensors b/checkpoints/001500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9283e66287e43e1074d4287228983e783bb389e3 --- /dev/null +++ b/checkpoints/001500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d69c1564e8bddd338698ddb63c6bfdeaa89bf5f1b4c541337e05f037469d3087 +size 15708 diff --git a/checkpoints/001500/training_state/scheduler_state.json b/checkpoints/001500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4d794225cec49e927486382ebe55cb577c95e617 --- /dev/null +++ b/checkpoints/001500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 1500, + "_step_count": 1501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 1.743718433538229e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/001500/training_state/training_step.json b/checkpoints/001500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..332c190c78659d9403699dab54afdd72e41d21b8 --- /dev/null +++ b/checkpoints/001500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 1500 +} \ No newline at end of file diff --git a/checkpoints/002000/pretrained_model/config.json b/checkpoints/002000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/002000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/002000/pretrained_model/model.safetensors b/checkpoints/002000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5248c29924a71a5af7c13624d0f38932d93554f3 --- /dev/null +++ b/checkpoints/002000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f348969b17f51f8dc78323b1480e4529b99cd1cf65f6fe5d68c2a34f8cff398d +size 906712520 diff --git a/checkpoints/002000/pretrained_model/policy_postprocessor.json b/checkpoints/002000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/002000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/002000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/002000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/002000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/002000/pretrained_model/policy_preprocessor.json b/checkpoints/002000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/002000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/002000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/002000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/002000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/002000/pretrained_model/train_config.json b/checkpoints/002000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/002000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/002000/training_state/optimizer_param_groups.json b/checkpoints/002000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..0d447ef6535d7d4b27c33d49e38a93248c409bf5 --- /dev/null +++ b/checkpoints/002000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 1.5625e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/002000/training_state/optimizer_state.safetensors b/checkpoints/002000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d82c9810d5e160c506253de74d33bc7c26e022e --- /dev/null +++ b/checkpoints/002000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2054cbe64e0f3d5a449afe2fe11c7722c61174d6c823b20c83c2054e813ad0c6 +size 412659164 diff --git a/checkpoints/002000/training_state/rng_state.safetensors b/checkpoints/002000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5468adcb1ee996d10373806af40dd325fbec4b2 --- /dev/null +++ b/checkpoints/002000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ebd0fcd2e1a9f564357d1116b488498027ee590702fd381810e4ce04eb480c9 +size 15708 diff --git a/checkpoints/002000/training_state/scheduler_state.json b/checkpoints/002000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3d4d20f171311928be23c17d3e6fc0c009d49fa2 --- /dev/null +++ b/checkpoints/002000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 2000, + "_step_count": 2001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 1.5625e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/002000/training_state/training_step.json b/checkpoints/002000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..39090bbb986edb821e1602990d19357dcdb5d2ae --- /dev/null +++ b/checkpoints/002000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 2000 +} \ No newline at end of file diff --git a/checkpoints/002500/pretrained_model/config.json b/checkpoints/002500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/002500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/002500/pretrained_model/model.safetensors b/checkpoints/002500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..06e21fb15defc4ca98a297d6ccb4ed5624b1652c --- /dev/null +++ b/checkpoints/002500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:106b1989438317d3164d6af3a0613de6f068c5cdd586e85a30f8da8b6e36234d +size 906712520 diff --git a/checkpoints/002500/pretrained_model/policy_postprocessor.json b/checkpoints/002500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/002500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/002500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/002500/pretrained_model/policy_preprocessor.json b/checkpoints/002500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/002500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/002500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/002500/pretrained_model/train_config.json b/checkpoints/002500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/002500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/002500/training_state/optimizer_param_groups.json b/checkpoints/002500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..3775ffe662b28f6ac7493f1bfc8dbeeb138a8574 --- /dev/null +++ b/checkpoints/002500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 1.3514666644647059e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/002500/training_state/optimizer_state.safetensors b/checkpoints/002500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1c7b255b27386ed19111ff4d5d5b0d6df2756029 --- /dev/null +++ b/checkpoints/002500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1051299de52e47d89168e1a78d727cd003c5b68294605798472214836872dbf +size 412659164 diff --git a/checkpoints/002500/training_state/rng_state.safetensors b/checkpoints/002500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2b1f6e22a53422bd537183423431c463ca167c48 --- /dev/null +++ b/checkpoints/002500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15ec205ea14eda30f3c9ef1b27bed8dedd89cde1579a466dd88c5ce1b82ac404 +size 15708 diff --git a/checkpoints/002500/training_state/scheduler_state.json b/checkpoints/002500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..10459612d75f531af040401469dd0b66ebe0c730 --- /dev/null +++ b/checkpoints/002500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 2500, + "_step_count": 2501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 1.3514666644647059e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/002500/training_state/training_step.json b/checkpoints/002500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..d265f47a09c2910099ed59e197b57b34675d1ae0 --- /dev/null +++ b/checkpoints/002500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 2500 +} \ No newline at end of file diff --git a/checkpoints/003000/pretrained_model/config.json b/checkpoints/003000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/003000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/003000/pretrained_model/model.safetensors b/checkpoints/003000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8e33ef71c6b17e0821158aaf4a9b13a469cb5a3d --- /dev/null +++ b/checkpoints/003000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4919c1a30db992010e56bbd15926ab065066d9cc6feb66a17ed251b6a9408369 +size 906712520 diff --git a/checkpoints/003000/pretrained_model/policy_postprocessor.json b/checkpoints/003000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/003000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/003000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/003000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/003000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/003000/pretrained_model/policy_preprocessor.json b/checkpoints/003000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/003000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/003000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/003000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/003000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/003000/pretrained_model/train_config.json b/checkpoints/003000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/003000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/003000/training_state/optimizer_param_groups.json b/checkpoints/003000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..f7882456954607effce713f15371b202d989c9b8 --- /dev/null +++ b/checkpoints/003000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 1.125e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/003000/training_state/optimizer_state.safetensors b/checkpoints/003000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4596a517a1153f7d8b765ceb8a3706f2c75c2a95 --- /dev/null +++ b/checkpoints/003000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20b72d3b92abfbd05db8b2c4b3c237da3eb15e80d0f4d8c4d22cf7dbc1de230f +size 412659164 diff --git a/checkpoints/003000/training_state/rng_state.safetensors b/checkpoints/003000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0379a7bf7a8bf778bd51371cd2b4198ce48c6ec1 --- /dev/null +++ b/checkpoints/003000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f196ed80b8f02bb1fced43cdcf6365ca2976b0764081915bc28cfcb577912b9a +size 15708 diff --git a/checkpoints/003000/training_state/scheduler_state.json b/checkpoints/003000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b961e951c7d082765484adebc45022dc99afa222 --- /dev/null +++ b/checkpoints/003000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 3000, + "_step_count": 3001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 1.125e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/003000/training_state/training_step.json b/checkpoints/003000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..c1a44127b7dfea653fd776d529fa83c55d32081c --- /dev/null +++ b/checkpoints/003000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 3000 +} \ No newline at end of file diff --git a/checkpoints/003500/pretrained_model/config.json b/checkpoints/003500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/003500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/003500/pretrained_model/model.safetensors b/checkpoints/003500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cf56f3c8815c39ab4b3777f3b26398cf53dbdfd4 --- /dev/null +++ b/checkpoints/003500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fc44f16d23c62f103745801241cffb70a8a0f91acee6a1bb67a96e3f05a4771 +size 906712520 diff --git a/checkpoints/003500/pretrained_model/policy_postprocessor.json b/checkpoints/003500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/003500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/003500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/003500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/003500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/003500/pretrained_model/policy_preprocessor.json b/checkpoints/003500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/003500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/003500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/003500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/003500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/003500/pretrained_model/train_config.json b/checkpoints/003500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/003500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/003500/training_state/optimizer_param_groups.json b/checkpoints/003500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..462ae88fba2503c2cf50c6390bfaeb7192df912d --- /dev/null +++ b/checkpoints/003500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 8.985333355352946e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/003500/training_state/optimizer_state.safetensors b/checkpoints/003500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a52ce6e6721536ee325b8c82586263fd6cd9d6bf --- /dev/null +++ b/checkpoints/003500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c444b60d0644ff8dcc4c4811711f99828c7fd514ae00e0422e5c35522d534955 +size 412659164 diff --git a/checkpoints/003500/training_state/rng_state.safetensors b/checkpoints/003500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4b388436fd5aee6487f7ef44b3dd20c02fe995f6 --- /dev/null +++ b/checkpoints/003500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:298968a5941542095fe693891decc6a94b8d40c6a0c92f11a794079fc701deb4 +size 15708 diff --git a/checkpoints/003500/training_state/scheduler_state.json b/checkpoints/003500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8ef0b8d7e8338a9d67f01ed564d408dd22234079 --- /dev/null +++ b/checkpoints/003500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 3500, + "_step_count": 3501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 8.985333355352946e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/003500/training_state/training_step.json b/checkpoints/003500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..ae7ae4f2823ad80bcbd73e17fec39ab540bebe83 --- /dev/null +++ b/checkpoints/003500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 3500 +} \ No newline at end of file diff --git a/checkpoints/004000/pretrained_model/config.json b/checkpoints/004000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/004000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/004000/pretrained_model/model.safetensors b/checkpoints/004000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8770a54408f30a22348f862d9ad73e81ec4d855e --- /dev/null +++ b/checkpoints/004000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea6e6576fcaa2231890b39cc5ded78f66e6d727326e4761756213ba33f5dae5a +size 906712520 diff --git a/checkpoints/004000/pretrained_model/policy_postprocessor.json b/checkpoints/004000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/004000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/004000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/004000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/004000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/004000/pretrained_model/policy_preprocessor.json b/checkpoints/004000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/004000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/004000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/004000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/004000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/004000/pretrained_model/train_config.json b/checkpoints/004000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/004000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/004000/training_state/optimizer_param_groups.json b/checkpoints/004000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..ca19b86e0e7aa6e10a038f454ea2bb963cf7e286 --- /dev/null +++ b/checkpoints/004000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 6.875000000000003e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/004000/training_state/optimizer_state.safetensors b/checkpoints/004000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..661d2b81b3775280262c063c8a7c1ac6eaeb566e --- /dev/null +++ b/checkpoints/004000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cebac996030addf5ac1478b8cf75199c40d9e7320d012b09e10184f079f064c +size 412659164 diff --git a/checkpoints/004000/training_state/rng_state.safetensors b/checkpoints/004000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..76b2b958764a286f7ae76978b2919091bc9455f9 --- /dev/null +++ b/checkpoints/004000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6456966bdf190d3285ecfa87d751ec5212e71cc1be41d4fd6df9f313b71e791d +size 15708 diff --git a/checkpoints/004000/training_state/scheduler_state.json b/checkpoints/004000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7d58db3e5606bd88919f85de99dafaf738d9d939 --- /dev/null +++ b/checkpoints/004000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 4000, + "_step_count": 4001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 6.875000000000003e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/004000/training_state/training_step.json b/checkpoints/004000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..75d870521c191f77dd9eaa4d83486eab6e768f69 --- /dev/null +++ b/checkpoints/004000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 4000 +} \ No newline at end of file diff --git a/checkpoints/004500/pretrained_model/config.json b/checkpoints/004500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/004500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/004500/pretrained_model/model.safetensors b/checkpoints/004500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ca5baf4ea9d18c13370e4311735d3e0e29709a3e --- /dev/null +++ b/checkpoints/004500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ba10f5398a876d613027e5d17c1b6bbb9772cb79fd4bfa6dc5cd64bbfc9a75f +size 906712520 diff --git a/checkpoints/004500/pretrained_model/policy_postprocessor.json b/checkpoints/004500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/004500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/004500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/004500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/004500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/004500/pretrained_model/policy_preprocessor.json b/checkpoints/004500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/004500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/004500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/004500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/004500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/004500/pretrained_model/train_config.json b/checkpoints/004500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/004500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/004500/training_state/optimizer_param_groups.json b/checkpoints/004500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..20b9f8f1bf41f05274119475116630a964d8fa74 --- /dev/null +++ b/checkpoints/004500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 5.062815664617711e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/004500/training_state/optimizer_state.safetensors b/checkpoints/004500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1a2efc1f2cc085bad5751fa18b602d7c2d056064 --- /dev/null +++ b/checkpoints/004500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed49185472a3efd9acf2169526c6e91194712d5ecdcb6f5c4cd7a748faf374bd +size 412659164 diff --git a/checkpoints/004500/training_state/rng_state.safetensors b/checkpoints/004500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..44c0fe135c719e266ecb8777a8c465353bd47ef4 --- /dev/null +++ b/checkpoints/004500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6913bd46249d51571ed505c7e395576e29507ec29d3dd7b3ca6258563114df65 +size 15708 diff --git a/checkpoints/004500/training_state/scheduler_state.json b/checkpoints/004500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a90fc9686b75344c7547e4f51817a9875405de49 --- /dev/null +++ b/checkpoints/004500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 4500, + "_step_count": 4501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 5.062815664617711e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/004500/training_state/training_step.json b/checkpoints/004500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..97f44a46f3389e808bf52d71d209b5cd5d38d52d --- /dev/null +++ b/checkpoints/004500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 4500 +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/config.json b/checkpoints/005000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/005000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/model.safetensors b/checkpoints/005000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fff0cabbdd790322731e3b140a72e35fe06859d5 --- /dev/null +++ b/checkpoints/005000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e4c97304ced8bdbe77e849581d20925d63ea087452f0aca4224b29672332a59 +size 906712520 diff --git a/checkpoints/005000/pretrained_model/policy_postprocessor.json b/checkpoints/005000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/005000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/005000/pretrained_model/policy_preprocessor.json b/checkpoints/005000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/005000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/005000/pretrained_model/train_config.json b/checkpoints/005000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/005000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/005000/training_state/optimizer_param_groups.json b/checkpoints/005000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..b248a0d68eb1ea477646d5cd9a4386eb0ceb3de4 --- /dev/null +++ b/checkpoints/005000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 3.6722777168861617e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/005000/training_state/optimizer_state.safetensors b/checkpoints/005000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9b2b231dbb643792dd24c17d86cb7888d9d20f7 --- /dev/null +++ b/checkpoints/005000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3bb64d189203473dc2286bcce22fe6d2eb5095bb55c308acc90a50e0860e363 +size 412659164 diff --git a/checkpoints/005000/training_state/rng_state.safetensors b/checkpoints/005000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f3fffb3041e4239e12ccbceb4ade64f42cc13c74 --- /dev/null +++ b/checkpoints/005000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cb38c2ca085b335bd4d03e4ba36d18ffcdd9e043cae774be0c1953a634d4d3a +size 15708 diff --git a/checkpoints/005000/training_state/scheduler_state.json b/checkpoints/005000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cce065455809f8ca416afcb3c6b46f4aedf314bb --- /dev/null +++ b/checkpoints/005000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 5000, + "_step_count": 5001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 3.6722777168861617e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/005000/training_state/training_step.json b/checkpoints/005000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..592449d3fc8b35c30c604eb1dabe60537e8224a0 --- /dev/null +++ b/checkpoints/005000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 5000 +} \ No newline at end of file diff --git a/checkpoints/005500/pretrained_model/config.json b/checkpoints/005500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/005500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/005500/pretrained_model/model.safetensors b/checkpoints/005500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e581501b603ddf4e25e4af877b718dd868a74389 --- /dev/null +++ b/checkpoints/005500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1eb92bfc6babed4e56b4573b4cbc5e1c9c20dfc4551e15085f3ee9f8c3d55fd8 +size 906712520 diff --git a/checkpoints/005500/pretrained_model/policy_postprocessor.json b/checkpoints/005500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/005500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/005500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/005500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/005500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/005500/pretrained_model/policy_preprocessor.json b/checkpoints/005500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/005500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/005500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/005500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/005500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/005500/pretrained_model/train_config.json b/checkpoints/005500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/005500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/005500/training_state/optimizer_param_groups.json b/checkpoints/005500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..325eaa79ced72d5af94b15a6b04dd3850eea13b8 --- /dev/null +++ b/checkpoints/005500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 2.7981490199706536e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/005500/training_state/optimizer_state.safetensors b/checkpoints/005500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9977983a30c5e557d31b100c75d17964ee473be7 --- /dev/null +++ b/checkpoints/005500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc7c3460717cf02a8bcf0bd95302940f672622733325f4c99057089d85e5b29 +size 412659164 diff --git a/checkpoints/005500/training_state/rng_state.safetensors b/checkpoints/005500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f0d23ea0c85585f93fe158f9eed8d0cef5344de2 --- /dev/null +++ b/checkpoints/005500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21301d5bfb5c61654fa4122d5555a05ffacaba5e664209c9812a33dbca4c5aad +size 15708 diff --git a/checkpoints/005500/training_state/scheduler_state.json b/checkpoints/005500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e4b1d81070479a5aee15dc3a8e053296e87575ac --- /dev/null +++ b/checkpoints/005500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 5500, + "_step_count": 5501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 2.7981490199706536e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/005500/training_state/training_step.json b/checkpoints/005500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..834c98398467b7a3f63786150a5f8159aa6fccfc --- /dev/null +++ b/checkpoints/005500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 5500 +} \ No newline at end of file diff --git a/checkpoints/006000/pretrained_model/config.json b/checkpoints/006000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a21fe7bccec9224873955ee5d47cdb13a4070ca6 --- /dev/null +++ b/checkpoints/006000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/006000/pretrained_model/model.safetensors b/checkpoints/006000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82c4e5192c8693d2da8d83230bc90428e4a7cc85 --- /dev/null +++ b/checkpoints/006000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:976bbe2c0b3f23ce46bbb37b6ce8dc42138e488287c7178cd7047df2d5a3e33e +size 906712520 diff --git a/checkpoints/006000/pretrained_model/policy_postprocessor.json b/checkpoints/006000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/006000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/006000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/006000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/006000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/006000/pretrained_model/policy_preprocessor.json b/checkpoints/006000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..de9836d1aee4ffe5a3c4eae0a137c2fcd1c3fc28 --- /dev/null +++ b/checkpoints/006000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "/data/models/smolvlm2-500m-eval2-grounded-v1" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/006000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/006000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/006000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/006000/pretrained_model/train_config.json b/checkpoints/006000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ca4701a53ad1bff9df9d0b75d23684e5f7c58c --- /dev/null +++ b/checkpoints/006000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "root": "/data/lerobot_cache/robot-learning-group47/eval2_180_full_promptaug_clean256_v1", + "episodes": null, + "image_transforms": { + "enable": true, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": true, + "use_peft": false, + "push_to_hub": false, + "repo_id": "robot-learning-group47/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "/data/lerobot_outputs/policies/eval1final_groundedvlm_smolvla_init_v1_migrated", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 256, + 256 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "/data/models/smolvlm2-500m-eval2-grounded-v1", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/lerobot_outputs/train/eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "job_name": "eval2_fullprompt_groundedvlm_hybrid_smolvla_v1", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 6000, + "eval_freq": 6000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 500, + "use_policy_training_preset": false, + "optimizer": { + "type": "adamw", + "lr": 2e-05, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 500, + "num_decay_steps": 6000, + "peak_lr": 2e-05, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gv9sfgx9", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/006000/training_state/optimizer_param_groups.json b/checkpoints/006000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..4dd5c7a8d5234f738d1cae91c518ff28c045e1e7 --- /dev/null +++ b/checkpoints/006000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 2.5e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 2e-05, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/006000/training_state/optimizer_state.safetensors b/checkpoints/006000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5c429a981669bc1e9c86cab4d9cbe3fb2adff4fb --- /dev/null +++ b/checkpoints/006000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:106c3d03ea1aea73817c4c2330f473e36bc16ab955c7282599e6798853e00a9e +size 412659164 diff --git a/checkpoints/006000/training_state/rng_state.safetensors b/checkpoints/006000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8df2243c581054b4271cf2dafa90bcc6fae95a95 --- /dev/null +++ b/checkpoints/006000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a91f61f79de8218e1316e8f3e9a20ffe62951f1c754403a4cbcd49d6b2ab388d +size 15708 diff --git a/checkpoints/006000/training_state/scheduler_state.json b/checkpoints/006000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..73263a84416728914d7a778b1d33ad3369601062 --- /dev/null +++ b/checkpoints/006000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 2e-05 + ], + "last_epoch": 6000, + "_step_count": 6001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 2.5e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/006000/training_state/training_step.json b/checkpoints/006000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..e267ac589be64705f8674638b9f5099c886778da --- /dev/null +++ b/checkpoints/006000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 6000 +} \ No newline at end of file