Robotics
LeRobot
Safetensors
smolvla
Chengmingethz committed on
Commit
cd4a75d
·
verified ·
1 Parent(s): 47abe85

Upload policy weights, train config and readme

Browse files
Files changed (4) hide show
  1. README.md +63 -24
  2. config.json +13 -13
  3. model.safetensors +2 -2
  4. train_config.json +24 -24
README.md CHANGED
@@ -1,24 +1,63 @@
1
- # eval3_sanitycheck
2
-
3
- SmolVLA sanity-check overfit policy for Eval 3 coke-can placement.
4
-
5
- - Base VLM: `HuggingFaceTB/SmolVLM2-500M-Video-Instruct`
6
- - Policy type: `smolvla`
7
- - Dataset: `robot-learning-group47/eval3_overfit20`
8
- - Local dataset root: `/data/lerobot_datasets/robot-learning-group47/eval3_overfit20`
9
- - Source episodes: `0,1,6,7,12,13,18,19,24,25,30,31,36,37,42,43,48,49,54,55`
10
- - Steps: `3000`
11
- - Batch size: `8`
12
- - Image augmentation: disabled
13
- - Trainable params: expert/state/action heads only, vision/VLM frozen
14
- - Final training loss: approximately `0.182`
15
-
16
- Important inference settings are stored in `config.json`:
17
-
18
- - `chunk_size=50`
19
- - `n_action_steps=50`
20
- - `num_steps=10`
21
- - `attention_mode=self_attn`
22
- - `num_vlm_layers=8`
23
- - `num_expert_layers=4`
24
- - `resize_imgs_with_padding=[512,512]`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: lerobot/smolvla_base
3
+ datasets: robot-learning-group47/eval3_sanity
4
+ library_name: lerobot
5
+ license: apache-2.0
6
+ model_name: smolvla
7
+ pipeline_tag: robotics
8
+ tags:
9
+ - smolvla
10
+ - robotics
11
+ - lerobot
12
+ ---
13
+
14
+ # Model Card for smolvla
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+ [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
20
+
21
+
22
+ This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
23
+ See the full documentation at [LeRobot Docs](https://huggingface.co/docs/lerobot/index).
24
+
25
+ ---
26
+
27
+ ## How to Get Started with the Model
28
+
29
+ For a complete walkthrough, see the [training guide](https://huggingface.co/docs/lerobot/il_robots#train-a-policy).
30
+ Below is the short version of how to train and run inference/eval:
31
+
32
+ ### Train from scratch
33
+
34
+ ```bash
35
+ lerobot-train \
36
+ --dataset.repo_id=${HF_USER}/<dataset> \
37
+ --policy.type=smolvla \
38
+ --output_dir=outputs/train/<desired_policy_repo_id> \
39
+ --job_name=lerobot_training \
40
+ --policy.device=cuda \
41
+ --policy.repo_id=${HF_USER}/<desired_policy_repo_id> \
42
+ --wandb.enable=true
43
+ ```
44
+
45
+ _Writes checkpoints to `outputs/train/<desired_policy_repo_id>/checkpoints/`._
46
+
47
+ ### Evaluate the policy/run inference
48
+
49
+ ```bash
50
+ lerobot-record \
51
+ --robot.type=so100_follower \
52
+ --dataset.repo_id=<hf_user>/eval_<dataset> \
53
+ --policy.path=<hf_user>/<desired_policy_repo_id> \
54
+ --episodes=10
55
+ ```
56
+
57
+ Prefix the dataset repo with **eval\_** and supply `--policy.path` pointing to a local or hub checkpoint.
58
+
59
+ ---
60
+
61
+ ## Model Details
62
+
63
+ - **License:** apache-2.0
config.json CHANGED
@@ -12,8 +12,8 @@
12
  "type": "VISUAL",
13
  "shape": [
14
  3,
15
- 240,
16
- 320
17
  ]
18
  }
19
  },
@@ -28,12 +28,12 @@
28
  "device": "cuda",
29
  "use_amp": false,
30
  "use_peft": false,
31
- "push_to_hub": false,
32
- "repo_id": null,
33
  "private": null,
34
  "tags": null,
35
  "license": null,
36
- "pretrained_path": null,
37
  "chunk_size": 50,
38
  "n_action_steps": 50,
39
  "normalization_mapping": {
@@ -53,8 +53,8 @@
53
  "tokenizer_max_length": 48,
54
  "num_steps": 10,
55
  "use_cache": true,
56
- "freeze_vision_encoder": true,
57
- "train_expert_only": true,
58
  "train_state_proj": true,
59
  "optimizer_lr": 0.0001,
60
  "optimizer_betas": [
@@ -63,18 +63,18 @@
63
  ],
64
  "optimizer_eps": 1e-08,
65
  "optimizer_weight_decay": 1e-10,
66
- "optimizer_grad_clip_norm": 10,
67
  "scheduler_warmup_steps": 100,
68
  "scheduler_decay_steps": 3000,
69
  "scheduler_decay_lr": 2.5e-06,
70
  "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
71
  "load_vlm_weights": true,
72
  "add_image_special_tokens": false,
73
- "attention_mode": "self_attn",
74
- "prefix_length": -1,
75
- "pad_language_to": "longest",
76
- "num_expert_layers": 4,
77
- "num_vlm_layers": 8,
78
  "self_attn_every_n_layers": 2,
79
  "expert_width_multiplier": 0.75,
80
  "min_period": 0.004,
 
12
  "type": "VISUAL",
13
  "shape": [
14
  3,
15
+ 256,
16
+ 256
17
  ]
18
  }
19
  },
 
28
  "device": "cuda",
29
  "use_amp": false,
30
  "use_peft": false,
31
+ "push_to_hub": true,
32
+ "repo_id": "robot-learning-group47/eval3_sanity",
33
  "private": null,
34
  "tags": null,
35
  "license": null,
36
+ "pretrained_path": "lerobot/smolvla_base",
37
  "chunk_size": 50,
38
  "n_action_steps": 50,
39
  "normalization_mapping": {
 
53
  "tokenizer_max_length": 48,
54
  "num_steps": 10,
55
  "use_cache": true,
56
+ "freeze_vision_encoder": false,
57
+ "train_expert_only": false,
58
  "train_state_proj": true,
59
  "optimizer_lr": 0.0001,
60
  "optimizer_betas": [
 
63
  ],
64
  "optimizer_eps": 1e-08,
65
  "optimizer_weight_decay": 1e-10,
66
+ "optimizer_grad_clip_norm": 10.0,
67
  "scheduler_warmup_steps": 100,
68
  "scheduler_decay_steps": 3000,
69
  "scheduler_decay_lr": 2.5e-06,
70
  "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
71
  "load_vlm_weights": true,
72
  "add_image_special_tokens": false,
73
+ "attention_mode": "cross_attn",
74
+ "prefix_length": 0,
75
+ "pad_language_to": "max_length",
76
+ "num_expert_layers": 0,
77
+ "num_vlm_layers": 16,
78
  "self_attn_every_n_layers": 2,
79
  "expert_width_multiplier": 0.75,
80
  "min_period": 0.004,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6de6bca545855eeae75b82f4b6728fbfdcfbaf85e6e5e3e30fe6ab9f5549d05f
3
- size 599749280
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ae144a7212881c51f4d1007ed5735b7108d5cf45ecf0113b04e5ee21cbbdc30
3
+ size 906712520
train_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "dataset": {
3
- "repo_id": "robot-learning-group47/eval3_overfit20",
4
- "root": "/data/lerobot_datasets/robot-learning-group47/eval3_overfit20",
5
  "episodes": null,
6
  "image_transforms": {
7
  "enable": false,
@@ -74,9 +74,9 @@
74
  }
75
  }
76
  },
77
- "revision": null,
78
  "use_imagenet_stats": true,
79
- "video_backend": "torchcodec",
80
  "return_uint8": false,
81
  "streaming": false
82
  },
@@ -95,8 +95,8 @@
95
  "type": "VISUAL",
96
  "shape": [
97
  3,
98
- 240,
99
- 320
100
  ]
101
  }
102
  },
@@ -111,12 +111,12 @@
111
  "device": "cuda",
112
  "use_amp": false,
113
  "use_peft": false,
114
- "push_to_hub": false,
115
- "repo_id": null,
116
  "private": null,
117
  "tags": null,
118
  "license": null,
119
- "pretrained_path": null,
120
  "chunk_size": 50,
121
  "n_action_steps": 50,
122
  "normalization_mapping": {
@@ -136,8 +136,8 @@
136
  "tokenizer_max_length": 48,
137
  "num_steps": 10,
138
  "use_cache": true,
139
- "freeze_vision_encoder": true,
140
- "train_expert_only": true,
141
  "train_state_proj": true,
142
  "optimizer_lr": 0.0001,
143
  "optimizer_betas": [
@@ -146,18 +146,18 @@
146
  ],
147
  "optimizer_eps": 1e-08,
148
  "optimizer_weight_decay": 1e-10,
149
- "optimizer_grad_clip_norm": 10,
150
  "scheduler_warmup_steps": 100,
151
  "scheduler_decay_steps": 3000,
152
  "scheduler_decay_lr": 2.5e-06,
153
  "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
154
  "load_vlm_weights": true,
155
  "add_image_special_tokens": false,
156
- "attention_mode": "self_attn",
157
- "prefix_length": -1,
158
- "pad_language_to": "longest",
159
- "num_expert_layers": 4,
160
- "num_vlm_layers": 8,
161
  "self_attn_every_n_layers": 2,
162
  "expert_width_multiplier": 0.75,
163
  "min_period": 0.004,
@@ -167,27 +167,27 @@
167
  "compile_mode": "max-autotune"
168
  },
169
  "reward_model": null,
170
- "output_dir": "outputs/train/2026-05-20/20-23-02_smolvla",
171
- "job_name": "smolvla",
172
  "resume": false,
173
  "seed": 1000,
174
  "cudnn_deterministic": false,
175
  "num_workers": 4,
176
- "batch_size": 8,
177
  "prefetch_factor": 4,
178
  "persistent_workers": true,
179
  "steps": 3000,
180
- "eval_freq": 20000,
181
  "log_freq": 50,
182
  "tolerance_s": 0.0001,
183
  "save_checkpoint": true,
184
- "save_freq": 500,
185
  "use_policy_training_preset": true,
186
  "optimizer": {
187
  "type": "adamw",
188
  "lr": 0.0001,
189
  "weight_decay": 1e-10,
190
- "grad_clip_norm": 10,
191
  "betas": [
192
  0.9,
193
  0.95
@@ -203,7 +203,7 @@
203
  },
204
  "eval": {
205
  "n_episodes": 50,
206
- "batch_size": 11,
207
  "use_async_envs": true
208
  },
209
  "wandb": {
 
1
  {
2
  "dataset": {
3
+ "repo_id": "robot-learning-group47/eval3_sanity",
4
+ "root": null,
5
  "episodes": null,
6
  "image_transforms": {
7
  "enable": false,
 
74
  }
75
  }
76
  },
77
+ "revision": "main",
78
  "use_imagenet_stats": true,
79
+ "video_backend": "pyav",
80
  "return_uint8": false,
81
  "streaming": false
82
  },
 
95
  "type": "VISUAL",
96
  "shape": [
97
  3,
98
+ 256,
99
+ 256
100
  ]
101
  }
102
  },
 
111
  "device": "cuda",
112
  "use_amp": false,
113
  "use_peft": false,
114
+ "push_to_hub": true,
115
+ "repo_id": "robot-learning-group47/eval3_sanity",
116
  "private": null,
117
  "tags": null,
118
  "license": null,
119
+ "pretrained_path": "lerobot/smolvla_base",
120
  "chunk_size": 50,
121
  "n_action_steps": 50,
122
  "normalization_mapping": {
 
136
  "tokenizer_max_length": 48,
137
  "num_steps": 10,
138
  "use_cache": true,
139
+ "freeze_vision_encoder": false,
140
+ "train_expert_only": false,
141
  "train_state_proj": true,
142
  "optimizer_lr": 0.0001,
143
  "optimizer_betas": [
 
146
  ],
147
  "optimizer_eps": 1e-08,
148
  "optimizer_weight_decay": 1e-10,
149
+ "optimizer_grad_clip_norm": 10.0,
150
  "scheduler_warmup_steps": 100,
151
  "scheduler_decay_steps": 3000,
152
  "scheduler_decay_lr": 2.5e-06,
153
  "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
154
  "load_vlm_weights": true,
155
  "add_image_special_tokens": false,
156
+ "attention_mode": "cross_attn",
157
+ "prefix_length": 0,
158
+ "pad_language_to": "max_length",
159
+ "num_expert_layers": 0,
160
+ "num_vlm_layers": 16,
161
  "self_attn_every_n_layers": 2,
162
  "expert_width_multiplier": 0.75,
163
  "min_period": 0.004,
 
167
  "compile_mode": "max-autotune"
168
  },
169
  "reward_model": null,
170
+ "output_dir": "outputs/train/eval3_sanity_smolvla_20260522_224000",
171
+ "job_name": "eval3_sanity_smolvla",
172
  "resume": false,
173
  "seed": 1000,
174
  "cudnn_deterministic": false,
175
  "num_workers": 4,
176
+ "batch_size": 32,
177
  "prefetch_factor": 4,
178
  "persistent_workers": true,
179
  "steps": 3000,
180
+ "eval_freq": 0,
181
  "log_freq": 50,
182
  "tolerance_s": 0.0001,
183
  "save_checkpoint": true,
184
+ "save_freq": 1000,
185
  "use_policy_training_preset": true,
186
  "optimizer": {
187
  "type": "adamw",
188
  "lr": 0.0001,
189
  "weight_decay": 1e-10,
190
+ "grad_clip_norm": 10.0,
191
  "betas": [
192
  0.9,
193
  0.95
 
203
  },
204
  "eval": {
205
  "n_episodes": 50,
206
+ "batch_size": 19,
207
  "use_async_envs": true
208
  },
209
  "wandb": {