diff --git a/checkpoints/002500/pretrained_model/config.json b/checkpoints/002500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/002500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/002500/pretrained_model/model.safetensors b/checkpoints/002500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2a91273aec6092d5e6f429abb0750eb7e590badd --- /dev/null +++ b/checkpoints/002500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990c0b627d7a4cc56e11c3863beba28ab6408e1432b624af75d6528949c9817d +size 906712520 diff --git a/checkpoints/002500/pretrained_model/policy_postprocessor.json b/checkpoints/002500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/002500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/002500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/002500/pretrained_model/policy_preprocessor.json b/checkpoints/002500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/002500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/002500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/002500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/002500/pretrained_model/train_config.json b/checkpoints/002500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/002500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/002500/training_state/optimizer_param_groups.json b/checkpoints/002500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..4a8eb6995b136451c178c1d02a60c63986f7c065 --- /dev/null +++ b/checkpoints/002500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 9.833888403159208e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/002500/training_state/optimizer_state.safetensors b/checkpoints/002500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2ac5179a49cf4ab629d0090d55641185e4f21150 --- /dev/null +++ b/checkpoints/002500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2e025dd75c1642ab8ba2f45c0b2ce007b3af1b21d9154f514dc6cb95f31a8da +size 412659164 diff --git a/checkpoints/002500/training_state/rng_state.safetensors b/checkpoints/002500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8ac83e521b4e823880af8e6b02a1de51a4b41377 --- /dev/null +++ b/checkpoints/002500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeb524ae8796f84429ba435ee68ba14b000ae2fee45eef29e1a588596246fbf5 +size 15708 diff --git a/checkpoints/002500/training_state/scheduler_state.json b/checkpoints/002500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2243c269694e95eb290978786e4c9526df1b7a6a --- /dev/null +++ b/checkpoints/002500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 2500, + "_step_count": 2501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 9.833888403159208e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/002500/training_state/training_step.json b/checkpoints/002500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..d265f47a09c2910099ed59e197b57b34675d1ae0 --- /dev/null +++ b/checkpoints/002500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 2500 +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/config.json b/checkpoints/005000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/005000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/model.safetensors b/checkpoints/005000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8f5887c3702fc44d3adaffad41999650849e6add --- /dev/null +++ b/checkpoints/005000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9006d084c8cf97f94e88fa9e29003b3a221f2526d7af4b621c871bd65300cb0b +size 906712520 diff --git a/checkpoints/005000/pretrained_model/policy_postprocessor.json b/checkpoints/005000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/005000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/005000/pretrained_model/policy_preprocessor.json b/checkpoints/005000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/005000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/005000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/005000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/005000/pretrained_model/train_config.json b/checkpoints/005000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/005000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/005000/training_state/optimizer_param_groups.json b/checkpoints/005000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..da3ff6fe3c49bde113b5e51b8f07a79834a00a55 --- /dev/null +++ b/checkpoints/005000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 9.34687384344914e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/005000/training_state/optimizer_state.safetensors b/checkpoints/005000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..80ed5d2d05469e48f55983f808613bdf9b1af933 --- /dev/null +++ b/checkpoints/005000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ecdce337f36f76648d017bcf7f0c03796e0513599b9fa836d4f0b096c66d30d +size 412659164 diff --git a/checkpoints/005000/training_state/rng_state.safetensors b/checkpoints/005000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0f8be1db94dc6b57da6bdcacc94f46dbd6b9f9f1 --- /dev/null +++ b/checkpoints/005000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfc68f0be14dcabf3e723a3df64785a3374f4a81792289b760665dcb7d93e732 +size 15708 diff --git a/checkpoints/005000/training_state/scheduler_state.json b/checkpoints/005000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..26e28372a197de2e43c4da4b379b2995d3c971c2 --- /dev/null +++ b/checkpoints/005000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 5000, + "_step_count": 5001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 9.34687384344914e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/005000/training_state/training_step.json b/checkpoints/005000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..592449d3fc8b35c30c604eb1dabe60537e8224a0 --- /dev/null +++ b/checkpoints/005000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 5000 +} \ No newline at end of file diff --git a/checkpoints/007500/pretrained_model/config.json b/checkpoints/007500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/007500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/007500/pretrained_model/model.safetensors b/checkpoints/007500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9e8e626bd191cd67c54e0f679579173c10f16096 --- /dev/null +++ b/checkpoints/007500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:382992eae7ec57f4fec4367abea53a0b4a92fcf5494b9b5abfc66c44c40f65e6 +size 906712520 diff --git a/checkpoints/007500/pretrained_model/policy_postprocessor.json b/checkpoints/007500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/007500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/007500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/007500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/007500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/007500/pretrained_model/policy_preprocessor.json b/checkpoints/007500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/007500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/007500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/007500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/007500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/007500/pretrained_model/train_config.json b/checkpoints/007500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/007500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/007500/training_state/optimizer_param_groups.json b/checkpoints/007500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..43575e068ea049c658a0470ace2668ea091c6723 --- /dev/null +++ b/checkpoints/007500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 8.572145558284418e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/007500/training_state/optimizer_state.safetensors b/checkpoints/007500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..05e78bb54e2f8dd3a73e260996fb50724aada7e5 --- /dev/null +++ b/checkpoints/007500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7682ad5a12fbdbf8915d77b2058c7dfd35a09276769f702b45adb92dfc2296a +size 412659164 diff --git a/checkpoints/007500/training_state/rng_state.safetensors b/checkpoints/007500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4744f8387925918d4cfc3fcf4a35cb61b4dbbb02 --- /dev/null +++ b/checkpoints/007500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80a26085319ddc9e849b5e69cff410514134af90a449e10265ec89007388d3ba +size 15708 diff --git a/checkpoints/007500/training_state/scheduler_state.json b/checkpoints/007500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af1e2297bc2901f61499236a41155372547389a8 --- /dev/null +++ b/checkpoints/007500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 7500, + "_step_count": 7501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 8.572145558284418e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/007500/training_state/training_step.json b/checkpoints/007500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..119265efe00a465e1aea6ca140089eb7aa8795c0 --- /dev/null +++ b/checkpoints/007500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 7500 +} \ No newline at end of file diff --git a/checkpoints/010000/pretrained_model/config.json b/checkpoints/010000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/010000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/010000/pretrained_model/model.safetensors b/checkpoints/010000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d4695c5f4f376adc2c82a1908ea6f16ea5bae5ee --- /dev/null +++ b/checkpoints/010000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01b0c9e4b6969a8072f918dcddb869c989f48ba14905ddf2ecad557e0432ecc9 +size 906712520 diff --git a/checkpoints/010000/pretrained_model/policy_postprocessor.json b/checkpoints/010000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/010000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/010000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/010000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/010000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/010000/pretrained_model/policy_preprocessor.json b/checkpoints/010000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/010000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/010000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/010000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/010000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/010000/pretrained_model/train_config.json b/checkpoints/010000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/010000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/010000/training_state/optimizer_param_groups.json b/checkpoints/010000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..1ee4c92efd08e25593632b371807c89fc1b8ecc2 --- /dev/null +++ b/checkpoints/010000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 7.5625e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/010000/training_state/optimizer_state.safetensors b/checkpoints/010000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6a694d8289122afa673cb3512cb5c4a662cc8708 --- /dev/null +++ b/checkpoints/010000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68d64402970f7587766c4b93144261235fcb3c11b777931e411cc522dac704ff +size 412659164 diff --git a/checkpoints/010000/training_state/rng_state.safetensors b/checkpoints/010000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b4d718d8d5559ecbac66d4368f22e97e68826cbd --- /dev/null +++ b/checkpoints/010000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87f24021efea5df129ed74e191d3e112bb16c4e329c4a4b0b379dcffb0113f60 +size 15708 diff --git a/checkpoints/010000/training_state/scheduler_state.json b/checkpoints/010000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c1a6fabdcd1c2a4515b7818e55be9009a5f7ef1e --- /dev/null +++ b/checkpoints/010000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 10000, + "_step_count": 10001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 7.5625e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/010000/training_state/training_step.json b/checkpoints/010000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..7cb7c0986e9e7461ca851ce71e95d235ae3d2732 --- /dev/null +++ b/checkpoints/010000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 10000 +} \ No newline at end of file diff --git a/checkpoints/012500/pretrained_model/config.json b/checkpoints/012500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/012500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/012500/pretrained_model/model.safetensors b/checkpoints/012500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..daea1a33aa18f433744fa5b58dc8b4f5b0b697c9 --- /dev/null +++ b/checkpoints/012500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4887ceb06dd53e9c89a33e834a025a7ea19bb1c820de0eb83650349254d66191 +size 906712520 diff --git a/checkpoints/012500/pretrained_model/policy_postprocessor.json b/checkpoints/012500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/012500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/012500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/012500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/012500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/012500/pretrained_model/policy_preprocessor.json b/checkpoints/012500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/012500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/012500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/012500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/012500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/012500/pretrained_model/train_config.json b/checkpoints/012500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/012500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/012500/training_state/optimizer_param_groups.json b/checkpoints/012500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..812a28183920482c0f8faf10ff2c929a68cffa8b --- /dev/null +++ b/checkpoints/012500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 6.38674284487479e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/012500/training_state/optimizer_state.safetensors b/checkpoints/012500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6e14f298c2265953d37f4ad3883d26f731a8808e --- /dev/null +++ b/checkpoints/012500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08dc82cfc053ebc53da091fbefd81ad879b08b521ce8349b3dc9bed7a5e08889 +size 412659164 diff --git a/checkpoints/012500/training_state/rng_state.safetensors b/checkpoints/012500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..64347dafef511a7e6db2988a0a10a248f54b15bf --- /dev/null +++ b/checkpoints/012500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0281664e820cf6db60f931c675d10ea3d3e32e87b2ce829ff30c23b9a97e2ebe +size 15708 diff --git a/checkpoints/012500/training_state/scheduler_state.json b/checkpoints/012500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0852d1dea3f53d37259a91d41d454bff2f58c4da --- /dev/null +++ b/checkpoints/012500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 12500, + "_step_count": 12501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 6.38674284487479e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/012500/training_state/training_step.json b/checkpoints/012500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..6d00d1fb30e9fa29506662caa05a99b86254c107 --- /dev/null +++ b/checkpoints/012500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 12500 +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/config.json b/checkpoints/015000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/015000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/model.safetensors b/checkpoints/015000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eb7c30a826b02d2c299d45fba4330c30406b8c90 --- /dev/null +++ b/checkpoints/015000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2e197258653a071ded73d094ecba1082c01d218756d7fdb634f12896fac0a8a +size 906712520 diff --git a/checkpoints/015000/pretrained_model/policy_postprocessor.json b/checkpoints/015000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/015000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/015000/pretrained_model/policy_preprocessor.json b/checkpoints/015000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/015000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/015000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/015000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/015000/pretrained_model/train_config.json b/checkpoints/015000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/015000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/015000/training_state/optimizer_param_groups.json b/checkpoints/015000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..9277f6533d9b54979939a62759a6fbdec0e5ac6c --- /dev/null +++ b/checkpoints/015000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 5.125e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/015000/training_state/optimizer_state.safetensors b/checkpoints/015000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3f375eda382661d6884b914690827c4e5fbdf4a8 --- /dev/null +++ b/checkpoints/015000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4319b40cc4e8db65854ae159d4813610aa0132f1a59cd00961da252ba1b6ceac +size 412659164 diff --git a/checkpoints/015000/training_state/rng_state.safetensors b/checkpoints/015000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a4de60e7f584b027d162c5a0f212ebbcda35f039 --- /dev/null +++ b/checkpoints/015000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:158dc28d7c505355dab6ff6ae49c850cdee0cb4046f9e8fcb7bb0653befbc90d +size 15708 diff --git a/checkpoints/015000/training_state/scheduler_state.json b/checkpoints/015000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ba6b6927e93a4428e4458c8496e3590dcbbac247 --- /dev/null +++ b/checkpoints/015000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 15000, + "_step_count": 15001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 5.125e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/015000/training_state/training_step.json b/checkpoints/015000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..5cec056c8ba7f3c2e865a0f73ae59975a0503067 --- /dev/null +++ b/checkpoints/015000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 15000 +} \ No newline at end of file diff --git a/checkpoints/017500/pretrained_model/config.json b/checkpoints/017500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/017500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/017500/pretrained_model/model.safetensors b/checkpoints/017500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5a8463969caf89b6f566d775f1869d80c798a4dc --- /dev/null +++ b/checkpoints/017500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a24100865d1f7d27acca3d0fab404c37abcd6777fec3c4d0da3499c01a9aa4db +size 906712520 diff --git a/checkpoints/017500/pretrained_model/policy_postprocessor.json b/checkpoints/017500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/017500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/017500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/017500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/017500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/017500/pretrained_model/policy_preprocessor.json b/checkpoints/017500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/017500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/017500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/017500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/017500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/017500/pretrained_model/train_config.json b/checkpoints/017500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/017500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/017500/training_state/optimizer_param_groups.json b/checkpoints/017500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..1fc2fc45607e594353d8008c7e1d86c540c6101b --- /dev/null +++ b/checkpoints/017500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 3.863257155125212e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/017500/training_state/optimizer_state.safetensors b/checkpoints/017500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d12fe7c1fc208b101dbd8071463f33f69c9f92ed --- /dev/null +++ b/checkpoints/017500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68f9c89e7e8fd9b4833a8a5323080b332a7660ed12cd79f3bcc863ebf103f027 +size 412659164 diff --git a/checkpoints/017500/training_state/rng_state.safetensors b/checkpoints/017500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2fd4edc52e0f32f1ebffe7c2bc1cce7d25bb918a --- /dev/null +++ b/checkpoints/017500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:763d5714afbc0266f7006a5a30497edbfe30cef6aaed244f4815b3ba4499ee90 +size 15708 diff --git a/checkpoints/017500/training_state/scheduler_state.json b/checkpoints/017500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4df9452e4c2557a375083bd9f206dea2f77ab676 --- /dev/null +++ b/checkpoints/017500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 17500, + "_step_count": 17501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 3.863257155125212e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/017500/training_state/training_step.json b/checkpoints/017500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..da8b274dab5ff8a957dfac52878d5b5a54a1a31d --- /dev/null +++ b/checkpoints/017500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 17500 +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/config.json b/checkpoints/020000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/020000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/model.safetensors b/checkpoints/020000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5cfc8ea019946ac79de20f7da3de028b440886f4 --- /dev/null +++ b/checkpoints/020000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08038e81663a5401cf951134958eee1bc7faceb4c0ad5419c332ac7ee261135a +size 906712520 diff --git a/checkpoints/020000/pretrained_model/policy_postprocessor.json b/checkpoints/020000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/020000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/020000/pretrained_model/policy_preprocessor.json b/checkpoints/020000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/020000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/020000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/020000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/020000/pretrained_model/train_config.json b/checkpoints/020000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/020000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/020000/training_state/optimizer_param_groups.json b/checkpoints/020000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..fd995dd00b618398663d24c98ad932310a8be4b7 --- /dev/null +++ b/checkpoints/020000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 2.6875000000000013e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/020000/training_state/optimizer_state.safetensors b/checkpoints/020000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..04f8c6ad2da1ad395113765d950f42614ed159d4 --- /dev/null +++ b/checkpoints/020000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:967c696fe50e41de1578b844125972061a2a51a46c04ca66206bac4a4d819d3d +size 412659164 diff --git a/checkpoints/020000/training_state/rng_state.safetensors b/checkpoints/020000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dbf568f00db6d3d890521c0381b0ffdfccc7f6b6 --- /dev/null +++ b/checkpoints/020000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c6194348403467c75aa3df245d62cd992782c3bb4723086602b6cb9051294c6 +size 15708 diff --git a/checkpoints/020000/training_state/scheduler_state.json b/checkpoints/020000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bc8702f14e2b5c016ee125c7113babe553dd2e53 --- /dev/null +++ b/checkpoints/020000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 20000, + "_step_count": 20001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 2.6875000000000013e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/020000/training_state/training_step.json b/checkpoints/020000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..dc9bb47026c5d5237ca6fc5dbff6020dd122ea05 --- /dev/null +++ b/checkpoints/020000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 20000 +} \ No newline at end of file diff --git a/checkpoints/022500/pretrained_model/config.json b/checkpoints/022500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/022500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/022500/pretrained_model/model.safetensors b/checkpoints/022500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2364a685ad507870205b9aed43696e8b6edbe159 --- /dev/null +++ b/checkpoints/022500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:073e630a48d9e8d3e465af998c4e480c48ca5ed57d109e6d1a34d9db8576e3a7 +size 906712520 diff --git a/checkpoints/022500/pretrained_model/policy_postprocessor.json b/checkpoints/022500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/022500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/022500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/022500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/022500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/022500/pretrained_model/policy_preprocessor.json b/checkpoints/022500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/022500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/022500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/022500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/022500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/022500/pretrained_model/train_config.json b/checkpoints/022500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/022500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/022500/training_state/optimizer_param_groups.json b/checkpoints/022500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..ac647e3325da800c05c8d3cbe177c91d11973a5f --- /dev/null +++ b/checkpoints/022500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 1.677854441715581e-05, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/022500/training_state/optimizer_state.safetensors b/checkpoints/022500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ddf78844f1a32e4ed71a6afe848ba687a2c8fbbc --- /dev/null +++ b/checkpoints/022500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f4aaaecdd1314bc16ef12bc60b721f1c6099cbc9748abbc6a14cf98239e3d1 +size 412659164 diff --git a/checkpoints/022500/training_state/rng_state.safetensors b/checkpoints/022500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c093f2e6577c46a13bb27a124f1654ec346e2e75 --- /dev/null +++ b/checkpoints/022500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b8f169d0f809697e44dda7d2c08f579f6feaaa3b33923673be2f7981c25dc8f +size 15708 diff --git a/checkpoints/022500/training_state/scheduler_state.json b/checkpoints/022500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3fbe00aac481debedc95e57cd5d2a2f64a69c057 --- /dev/null +++ b/checkpoints/022500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 22500, + "_step_count": 22501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 1.677854441715581e-05 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/022500/training_state/training_step.json b/checkpoints/022500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..f806cbfd68a16a907ec083e2c16fb225eb569102 --- /dev/null +++ b/checkpoints/022500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 22500 +} \ No newline at end of file diff --git a/checkpoints/025000/pretrained_model/config.json b/checkpoints/025000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/025000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/025000/pretrained_model/model.safetensors b/checkpoints/025000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92ce814cb61eb5b02b4e1bf3c771b4882c79acc9 --- /dev/null +++ b/checkpoints/025000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76cdfe8d9bb0728efbdfe2eb802b920a47807e01342ad43640b11f9598ce59b6 +size 906712520 diff --git a/checkpoints/025000/pretrained_model/policy_postprocessor.json b/checkpoints/025000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/025000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/025000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/025000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/025000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/025000/pretrained_model/policy_preprocessor.json b/checkpoints/025000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/025000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/025000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/025000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/025000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/025000/pretrained_model/train_config.json b/checkpoints/025000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/025000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/025000/training_state/optimizer_param_groups.json b/checkpoints/025000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..e3ca9c8087502a565c9fab9b9cba038bf4a0fa88 --- /dev/null +++ b/checkpoints/025000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 9.031261565508626e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/025000/training_state/optimizer_state.safetensors b/checkpoints/025000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ff173728c453759683f3529d2e9845cbef07d7e --- /dev/null +++ b/checkpoints/025000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1540ad32f7cc678e1084cb8793a7fa9a03c3dfe9b51fa0445f8585157ae54d4a +size 412659164 diff --git a/checkpoints/025000/training_state/rng_state.safetensors b/checkpoints/025000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c2577a15313fe4af8d51bdb0bab26b3e3070bc20 --- /dev/null +++ b/checkpoints/025000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f5be08d4ddbc92b15ebe2615682a7148fde147aa178cd98abac630d828dfff3 +size 15708 diff --git a/checkpoints/025000/training_state/scheduler_state.json b/checkpoints/025000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cbcf8cfe42f784efa28d35a6863ba0dc2c596011 --- /dev/null +++ b/checkpoints/025000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 25000, + "_step_count": 25001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 9.031261565508626e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/025000/training_state/training_step.json b/checkpoints/025000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..ac6903ae05e7bbaec1e2da710fdbe91d8dc3f37d --- /dev/null +++ b/checkpoints/025000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 25000 +} \ No newline at end of file diff --git a/checkpoints/027500/pretrained_model/config.json b/checkpoints/027500/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/027500/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/027500/pretrained_model/model.safetensors b/checkpoints/027500/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6bbecf49423efb1b1fc924d14b1703382fa61842 --- /dev/null +++ b/checkpoints/027500/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f654e66d0dea3074e2cdf601012ccfca5d0e23b80d4922b6ca7fb6f69c76cd90 +size 906712520 diff --git a/checkpoints/027500/pretrained_model/policy_postprocessor.json b/checkpoints/027500/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/027500/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/027500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/027500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/027500/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/027500/pretrained_model/policy_preprocessor.json b/checkpoints/027500/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/027500/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/027500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/027500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/027500/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/027500/pretrained_model/train_config.json b/checkpoints/027500/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/027500/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/027500/training_state/optimizer_param_groups.json b/checkpoints/027500/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..d64750c4b9541926eff51c3d00b2e53cfa37c25f --- /dev/null +++ b/checkpoints/027500/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 4.161115968407925e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/027500/training_state/optimizer_state.safetensors b/checkpoints/027500/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..62808461ab0651aa97c3ef698b26ab4acb4a458d --- /dev/null +++ b/checkpoints/027500/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f2cf1a16974fafcfb11403fdf947384ceba3680ce099bc53cdb39e6427aefd1 +size 412659164 diff --git a/checkpoints/027500/training_state/rng_state.safetensors b/checkpoints/027500/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c8b6cacf921bf86ed1d05a1a6764b7b21dafd4ad --- /dev/null +++ b/checkpoints/027500/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7a32932df1b58f77e5cd01f026d882f676e24bf36511e6f448240c681e3c3a4 +size 15708 diff --git a/checkpoints/027500/training_state/scheduler_state.json b/checkpoints/027500/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c081719f3552fd72683cc831570dc198e116d92e --- /dev/null +++ b/checkpoints/027500/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 27500, + "_step_count": 27501, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 4.161115968407925e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/027500/training_state/training_step.json b/checkpoints/027500/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..bd4b7f79233b940b70aec36bd79954b47a0e826d --- /dev/null +++ b/checkpoints/027500/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 27500 +} \ No newline at end of file diff --git a/checkpoints/030000/pretrained_model/config.json b/checkpoints/030000/pretrained_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8601678817e2ea855bf0be734faeadd27b49a --- /dev/null +++ b/checkpoints/030000/pretrained_model/config.json @@ -0,0 +1,117 @@ +{ + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" +} \ No newline at end of file diff --git a/checkpoints/030000/pretrained_model/model.safetensors b/checkpoints/030000/pretrained_model/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..17b2e779b82b212bd1d3e947d27b329b03a8674d --- /dev/null +++ b/checkpoints/030000/pretrained_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bf6c5505cb5182ecb124c7546646bd1aa7fba33918d2f0b58f174113539fb54 +size 906712520 diff --git a/checkpoints/030000/pretrained_model/policy_postprocessor.json b/checkpoints/030000/pretrained_model/policy_postprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8997e5be18c67bad9377dd2cd9622ba38b5ae3 --- /dev/null +++ b/checkpoints/030000/pretrained_model/policy_postprocessor.json @@ -0,0 +1,32 @@ +{ + "name": "policy_postprocessor", + "steps": [ + { + "registry_name": "unnormalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_postprocessor_step_0_unnormalizer_processor.safetensors" + }, + { + "registry_name": "device_processor", + "config": { + "device": "cpu", + "float_dtype": null + } + } + ] +} \ No newline at end of file diff --git a/checkpoints/030000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors b/checkpoints/030000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/030000/pretrained_model/policy_postprocessor_step_0_unnormalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/030000/pretrained_model/policy_preprocessor.json b/checkpoints/030000/pretrained_model/policy_preprocessor.json new file mode 100644 index 0000000000000000000000000000000000000000..0f385ae81acb5b0984ccff867bbd11476774c619 --- /dev/null +++ b/checkpoints/030000/pretrained_model/policy_preprocessor.json @@ -0,0 +1,105 @@ +{ + "name": "policy_preprocessor", + "steps": [ + { + "registry_name": "rename_observations_processor", + "config": { + "rename_map": { + "observation.images.front": "observation.images.camera1" + } + } + }, + { + "registry_name": "to_batch_processor", + "config": {} + }, + { + "registry_name": "smolvla_new_line_processor", + "config": {} + }, + { + "registry_name": "tokenizer_processor", + "config": { + "max_length": 48, + "task_key": "task", + "padding_side": "right", + "padding": "max_length", + "truncation": true, + "tokenizer_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + } + }, + { + "registry_name": "device_processor", + "config": { + "device": "cuda", + "float_dtype": null + } + }, + { + "registry_name": "normalizer_processor", + "config": { + "eps": 1e-08, + "features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "norm_map": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + } + }, + "state_file": "policy_preprocessor_step_5_normalizer_processor.safetensors" + } + ] +} \ No newline at end of file diff --git a/checkpoints/030000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors b/checkpoints/030000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a30e841d635b24cb6cde5c742deead9c085aa7f2 --- /dev/null +++ b/checkpoints/030000/pretrained_model/policy_preprocessor_step_5_normalizer_processor.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2e0b200b31b18207c388ca19e7f781338f3487ec5cdf922dcb369a74a9606e +size 2052 diff --git a/checkpoints/030000/pretrained_model/train_config.json b/checkpoints/030000/pretrained_model/train_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9bc8dfcaa95a8bee22612672f605911e3bc63bad --- /dev/null +++ b/checkpoints/030000/pretrained_model/train_config.json @@ -0,0 +1,257 @@ +{ + "dataset": { + "repo_id": "robot-learning-group47/eval2_180_chengming_promptaug_clean_256_v3", + "root": null, + "episodes": null, + "image_transforms": { + "enable": false, + "max_num_transforms": 3, + "random_order": false, + "tfs": { + "brightness": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "brightness": [ + 0.8, + 1.2 + ] + } + }, + "contrast": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "contrast": [ + 0.8, + 1.2 + ] + } + }, + "saturation": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "saturation": [ + 0.5, + 1.5 + ] + } + }, + "hue": { + "weight": 1.0, + "type": "ColorJitter", + "kwargs": { + "hue": [ + -0.05, + 0.05 + ] + } + }, + "sharpness": { + "weight": 1.0, + "type": "SharpnessJitter", + "kwargs": { + "sharpness": [ + 0.5, + 1.5 + ] + } + }, + "affine": { + "weight": 1.0, + "type": "RandomAffine", + "kwargs": { + "degrees": [ + -5.0, + 5.0 + ], + "translate": [ + 0.05, + 0.05 + ] + } + } + } + }, + "revision": "main", + "use_imagenet_stats": true, + "video_backend": "torchcodec", + "return_uint8": false, + "streaming": false + }, + "env": null, + "policy": { + "type": "smolvla", + "n_obs_steps": 1, + "input_features": { + "observation.state": { + "type": "STATE", + "shape": [ + 6 + ] + }, + "observation.images.camera1": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera2": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.camera3": { + "type": "VISUAL", + "shape": [ + 3, + 256, + 256 + ] + }, + "observation.images.empty_camera_0": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + }, + "observation.images.empty_camera_1": { + "type": "VISUAL", + "shape": [ + 3, + 480, + 640 + ] + } + }, + "output_features": { + "action": { + "type": "ACTION", + "shape": [ + 6 + ] + } + }, + "device": "cuda", + "use_amp": false, + "use_peft": false, + "push_to_hub": true, + "repo_id": "robot-learning-group47/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "private": null, + "tags": null, + "license": null, + "pretrained_path": "lerobot/smolvla_base", + "chunk_size": 50, + "n_action_steps": 50, + "normalization_mapping": { + "VISUAL": "IDENTITY", + "STATE": "MEAN_STD", + "ACTION": "MEAN_STD" + }, + "max_state_dim": 32, + "max_action_dim": 32, + "resize_imgs_with_padding": [ + 512, + 512 + ], + "empty_cameras": 2, + "adapt_to_pi_aloha": false, + "use_delta_joint_actions_aloha": false, + "tokenizer_max_length": 48, + "num_steps": 10, + "use_cache": true, + "freeze_vision_encoder": true, + "train_expert_only": true, + "train_state_proj": true, + "optimizer_lr": 0.0001, + "optimizer_betas": [ + 0.9, + 0.95 + ], + "optimizer_eps": 1e-08, + "optimizer_weight_decay": 1e-10, + "optimizer_grad_clip_norm": 10.0, + "scheduler_warmup_steps": 1000, + "scheduler_decay_steps": 30000, + "scheduler_decay_lr": 2.5e-06, + "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + "load_vlm_weights": true, + "add_image_special_tokens": false, + "attention_mode": "cross_attn", + "prefix_length": 0, + "pad_language_to": "max_length", + "num_expert_layers": 0, + "num_vlm_layers": 16, + "self_attn_every_n_layers": 2, + "expert_width_multiplier": 0.75, + "min_period": 0.004, + "max_period": 4.0, + "rtc_config": null, + "compile_model": false, + "compile_mode": "max-autotune" + }, + "reward_model": null, + "output_dir": "/data/outputs/train/eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "job_name": "eval2_180_clean_256_v3_promptaug_smolvla_base_30k", + "resume": false, + "seed": 1000, + "cudnn_deterministic": false, + "num_workers": 12, + "batch_size": 64, + "prefetch_factor": 4, + "persistent_workers": true, + "steps": 30000, + "eval_freq": 20000, + "log_freq": 100, + "tolerance_s": 0.0001, + "save_checkpoint": true, + "save_freq": 2500, + "use_policy_training_preset": true, + "optimizer": { + "type": "adamw", + "lr": 0.0001, + "weight_decay": 1e-10, + "grad_clip_norm": 10.0, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08 + }, + "scheduler": { + "type": "cosine_decay_with_warmup", + "num_warmup_steps": 1000, + "num_decay_steps": 30000, + "peak_lr": 0.0001, + "decay_lr": 2.5e-06 + }, + "eval": { + "n_episodes": 50, + "batch_size": 11, + "use_async_envs": true + }, + "wandb": { + "enable": true, + "disable_artifact": false, + "project": "lerobot", + "entity": null, + "notes": null, + "run_id": "gc4bjq78", + "mode": null, + "add_tags": true + }, + "peft": null, + "sample_weighting": null, + "rename_map": { + "observation.images.front": "observation.images.camera1" + }, + "checkpoint_path": null +} \ No newline at end of file diff --git a/checkpoints/030000/training_state/optimizer_param_groups.json b/checkpoints/030000/training_state/optimizer_param_groups.json new file mode 100644 index 0000000000000000000000000000000000000000..432e25d9d9051a91f53e76a5753722c07eba5dab --- /dev/null +++ b/checkpoints/030000/training_state/optimizer_param_groups.json @@ -0,0 +1,521 @@ +[ + { + "lr": 2.5e-06, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "weight_decay": 1e-10, + "amsgrad": false, + "maximize": false, + "foreach": null, + "capturable": false, + "differentiable": false, + "fused": null, + "decoupled_weight_decay": true, + "initial_lr": 0.0001, + "params": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 299, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 310, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 319, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 371, + 372, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 402, + 403, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 412, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 422, + 423, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 450, + 451, + 452, + 453, + 454, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 467, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 486, + 487, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 497, + 498, + 499 + ] + } +] \ No newline at end of file diff --git a/checkpoints/030000/training_state/optimizer_state.safetensors b/checkpoints/030000/training_state/optimizer_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7e83df06c6aaefb03bd5477ecb99438507b7840d --- /dev/null +++ b/checkpoints/030000/training_state/optimizer_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:688bc765a3f799ea1848220df5d9921f17c78f0413e1f76a42bd829015631541 +size 412659164 diff --git a/checkpoints/030000/training_state/rng_state.safetensors b/checkpoints/030000/training_state/rng_state.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48816481e3804c6bc07d82e65dd2bf248c62c583 --- /dev/null +++ b/checkpoints/030000/training_state/rng_state.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:481460ad134400f69117dfa41a37ec871ddd3cea6052b329a46355047f565be3 +size 15708 diff --git a/checkpoints/030000/training_state/scheduler_state.json b/checkpoints/030000/training_state/scheduler_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a09e9ada2b0bc0b2f519fcd89834a9e8725ec165 --- /dev/null +++ b/checkpoints/030000/training_state/scheduler_state.json @@ -0,0 +1,15 @@ +{ + "base_lrs": [ + 0.0001 + ], + "last_epoch": 30000, + "_step_count": 30001, + "_is_initial": false, + "_get_lr_called_within_step": false, + "_last_lr": [ + 2.5e-06 + ], + "lr_lambdas": [ + null + ] +} \ No newline at end of file diff --git a/checkpoints/030000/training_state/training_step.json b/checkpoints/030000/training_state/training_step.json new file mode 100644 index 0000000000000000000000000000000000000000..f4945f660f45b332883dccfccf18d8b8815d916a --- /dev/null +++ b/checkpoints/030000/training_state/training_step.json @@ -0,0 +1,3 @@ +{ + "step": 30000 +} \ No newline at end of file