MayerZhu1995 committed on
Commit
eb7b30d
·
verified ·
1 Parent(s): 62aac99

Add gr00t_qwen3vl_0.6b_libero checkpoint

Browse files
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  pi05_libero/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  pi05_base/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  pi0_base/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
36
  pi05_libero/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  pi05_base/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  pi0_base/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ gr00t_qwen3vl_0.6b_libero/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
gr00t_qwen3vl_0.6b_libero/checkpoints/step-104160-epoch-24-loss=0.0550.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0c1a3af779899af469531fb8df0b4c5b7c1c6306a27e5ee00f68ac427cd3e6b
3
+ size 25309396498
gr00t_qwen3vl_0.6b_libero/config.json ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval": {
3
+ "dataset": {
4
+ "transforms": [
5
+ {
6
+ "embodiment_id": 2,
7
+ "img_keys": [
8
+ "agentview_image",
9
+ "robot0_eye_in_hand_image"
10
+ ],
11
+ "type": "ProcessLiberoEvalInputs"
12
+ },
13
+ {
14
+ "type": "ConvertPILImageToNumpyArray"
15
+ },
16
+ {
17
+ "image_mean": [
18
+ 0.48145466,
19
+ 0.4578275,
20
+ 0.40821073
21
+ ],
22
+ "image_std": [
23
+ 0.26862954,
24
+ 0.26130258,
25
+ 0.27577711
26
+ ],
27
+ "img_key": "pixel_values",
28
+ "max_pixels": 1003520,
29
+ "merge_size": 2,
30
+ "min_pixels": 3136,
31
+ "patch_size": 16,
32
+ "temporal_patch_size": 2,
33
+ "to_tensor": true,
34
+ "type": "QWen2VLImageTransform"
35
+ },
36
+ {
37
+ "tokenizer": {
38
+ "model_path": "./checkpoints/LimX-Qwen3-VL-0.6B-stage1-20260223/",
39
+ "type": "PretrainedTokenizer"
40
+ },
41
+ "type": "LiberoPromptFromInputs"
42
+ },
43
+ {
44
+ "gripper_key": "robot0_gripper_qpos",
45
+ "norm_type": "mean_std",
46
+ "out_key": "states",
47
+ "pos_key": "robot0_eef_pos",
48
+ "quat_key": "robot0_eef_quat",
49
+ "state_dim": 64,
50
+ "type": "LiberoProprioFromInputs"
51
+ }
52
+ ],
53
+ "type": "LiberoParquetEvalDataset"
54
+ },
55
+ "denormalize_action": {
56
+ "norm_type": "mean_std",
57
+ "type": "DenormalizeLiberoAction"
58
+ },
59
+ "eval_chunk_size": 10,
60
+ "model_family": "pi0",
61
+ "num_steps_wait": 10,
62
+ "num_trials_per_task": 50,
63
+ "resize_size": 224,
64
+ "seed": 7,
65
+ "task_suite_name": "libero_10",
66
+ "type": "LiberoEvalRunner"
67
+ },
68
+ "inference_model": {
69
+ "freeze_projector": false,
70
+ "freeze_vlm_backbone": false,
71
+ "name_mapping": null,
72
+ "pretrained_name_or_path": null,
73
+ "type": "LlavaVLA",
74
+ "vla_head": {
75
+ "action_dim": 32,
76
+ "backbone_embedding_dim": 2048,
77
+ "diffusion_model_cfg": {
78
+ "attention_head_dim": 48,
79
+ "cross_attention_dim": 2048,
80
+ "dropout": 0.2,
81
+ "final_dropout": true,
82
+ "interleave_self_attention": true,
83
+ "norm_type": "ada_norm",
84
+ "num_attention_heads": 32,
85
+ "num_layers": 16,
86
+ "output_dim": 1024,
87
+ "positional_embeddings": null
88
+ },
89
+ "hidden_size": 1024,
90
+ "input_embedding_dim": 1536,
91
+ "num_heads": 4,
92
+ "num_inference_timesteps": 4,
93
+ "num_layers": 1,
94
+ "ori_action_dim": 7,
95
+ "state_dim": 64,
96
+ "traj_length": 10,
97
+ "type": "FlowMatchingHead",
98
+ "vl_self_attention_cfg": {
99
+ "attention_head_dim": 64,
100
+ "dropout": 0.2,
101
+ "final_dropout": true,
102
+ "num_attention_heads": 32,
103
+ "num_layers": 4,
104
+ "positional_embeddings": null
105
+ }
106
+ },
107
+ "vlm_backbone": {
108
+ "attn_implementation": "sdpa",
109
+ "projection_output_dim": 2048,
110
+ "projection_type": "linear",
111
+ "type": "Qwen3VL",
112
+ "use_projection": true,
113
+ "vlm_backbone_id": "qwen3_0.6b_vl_pt",
114
+ "vlm_config": null,
115
+ "vlm_path": "./checkpoints/LimX-Qwen3-VL-0.6B-stage1-20260223/"
116
+ }
117
+ },
118
+ "model": {
119
+ "freeze_projector": false,
120
+ "freeze_vlm_backbone": false,
121
+ "name_mapping": {
122
+ "vla_head": "action_head"
123
+ },
124
+ "pretrained_name_or_path": "./checkpoints/GR00T-N1.5-3B",
125
+ "strict_mapping": false,
126
+ "type": "LlavaVLA",
127
+ "vla_head": {
128
+ "action_dim": 32,
129
+ "backbone_embedding_dim": 2048,
130
+ "diffusion_model_cfg": {
131
+ "attention_head_dim": 48,
132
+ "cross_attention_dim": 2048,
133
+ "dropout": 0.2,
134
+ "final_dropout": true,
135
+ "interleave_self_attention": true,
136
+ "norm_type": "ada_norm",
137
+ "num_attention_heads": 32,
138
+ "num_layers": 16,
139
+ "output_dim": 1024,
140
+ "positional_embeddings": null
141
+ },
142
+ "hidden_size": 1024,
143
+ "input_embedding_dim": 1536,
144
+ "num_heads": 4,
145
+ "num_inference_timesteps": 4,
146
+ "num_layers": 1,
147
+ "ori_action_dim": 7,
148
+ "state_dim": 64,
149
+ "traj_length": 10,
150
+ "type": "FlowMatchingHead",
151
+ "vl_self_attention_cfg": {
152
+ "attention_head_dim": 64,
153
+ "dropout": 0.2,
154
+ "final_dropout": true,
155
+ "num_attention_heads": 32,
156
+ "num_layers": 4,
157
+ "positional_embeddings": null
158
+ }
159
+ },
160
+ "vlm_backbone": {
161
+ "attn_implementation": "sdpa",
162
+ "projection_output_dim": 2048,
163
+ "projection_type": "linear",
164
+ "type": "Qwen3VL",
165
+ "use_projection": true,
166
+ "vlm_backbone_id": "qwen3_0.6b_vl_pt",
167
+ "vlm_config": null,
168
+ "vlm_path": "./checkpoints/LimX-Qwen3-VL-0.6B-stage1-20260223/"
169
+ }
170
+ },
171
+ "runner": {
172
+ "change_key_name": false,
173
+ "collator": {
174
+ "keys": [
175
+ "states",
176
+ "observation.eepose",
177
+ "timestamp",
178
+ "images",
179
+ "img_masks",
180
+ "lang_tokens",
181
+ "lang_masks",
182
+ "actions",
183
+ "action_masks",
184
+ "embodiment_ids",
185
+ "image_grid_thw"
186
+ ],
187
+ "meta_keys": [
188
+ "task_description",
189
+ "prompt",
190
+ "info",
191
+ "stats"
192
+ ],
193
+ "type": "DictCollator"
194
+ },
195
+ "enable_gradient_checkpointing": false,
196
+ "enable_mixed_precision_training": true,
197
+ "learning_rate": 1.5e-05,
198
+ "lr_scheduler_type": "linear-warmup+cosine-decay",
199
+ "max_epochs": 24,
200
+ "max_grad_norm": 1.0,
201
+ "metric": {
202
+ "active_trackers": [
203
+ "jsonl",
204
+ "wandb"
205
+ ],
206
+ "grad_accumulation_steps": 1,
207
+ "run_dir": "work_dirs",
208
+ "type": "VLAMetric",
209
+ "window_size": 1
210
+ },
211
+ "mixed_precision_dtype": "bf16",
212
+ "sampler": null,
213
+ "sharding_strategy": "full-shard",
214
+ "tokenizer": {
215
+ "model_path": "./checkpoints/LimX-Qwen3-VL-0.6B-stage1-20260223/",
216
+ "type": "PretrainedTokenizer"
217
+ },
218
+ "type": "FSDPTrainRunner",
219
+ "warmup_ratio": 0.03,
220
+ "weight_decay": 0.0
221
+ },
222
+ "train_dataloader": {
223
+ "dataset": {
224
+ "dataset_statistics": {
225
+ "libero_10_no_noops": {
226
+ "action": {
227
+ "max": [
228
+ 0.9375,
229
+ 0.9375,
230
+ 0.9375,
231
+ 0.32892856001853943,
232
+ 0.36964285373687744,
233
+ 0.375,
234
+ 1.0
235
+ ],
236
+ "mean": [
237
+ 0.01905656634877842,
238
+ 0.05672475971568838,
239
+ -0.056239289430234256,
240
+ 0.004756678478841528,
241
+ 0.002797492338491304,
242
+ -0.00714607048416358,
243
+ 0.54599156235075
244
+ ],
245
+ "min": [
246
+ -0.9375,
247
+ -0.9375,
248
+ -0.9375,
249
+ -0.23642857372760773,
250
+ -0.3053571283817291,
251
+ -0.3642857074737549,
252
+ 0.0
253
+ ],
254
+ "q01": [
255
+ -0.4997477764535965,
256
+ -0.6992653512084763,
257
+ -0.6543309163615124,
258
+ -0.07417070079989778,
259
+ -0.11898748445770971,
260
+ -0.15976085962510805,
261
+ 0.0
262
+ ],
263
+ "q99": [
264
+ 0.658747846713789,
265
+ 0.7333480638990948,
266
+ 0.768601965587579,
267
+ 0.09784501244893279,
268
+ 0.12943469061349036,
269
+ 0.15137893471596325,
270
+ 1.0
271
+ ],
272
+ "std": [
273
+ 0.10588348353857541,
274
+ 0.13552477199270377,
275
+ 0.13886650724555177,
276
+ 0.01433739270759898,
277
+ 0.02038583948325967,
278
+ 0.033299202425577934,
279
+ 0.1881810653484855
280
+ ]
281
+ },
282
+ "proprio": {
283
+ "max": [
284
+ 0.2103137969970703,
285
+ 0.38887521624565125,
286
+ 1.333192229270935,
287
+ 3.7248642444610596,
288
+ 3.5618896484375,
289
+ 1.3863215446472168,
290
+ 0.041575800627470016,
291
+ 0.0013126095291227102
292
+ ],
293
+ "mean": [
294
+ -0.0419132679050224,
295
+ 0.034591788297521735,
296
+ 0.8265881844959498,
297
+ 2.90259518190321,
298
+ -0.5570652600832564,
299
+ -0.16592166873533284,
300
+ 0.02845031351083622,
301
+ -0.02880236273799356
302
+ ],
303
+ "min": [
304
+ -0.48278069496154785,
305
+ -0.3309336006641388,
306
+ 0.44550687074661255,
307
+ 1.1323540210723877,
308
+ -3.6312508583068848,
309
+ -1.842738389968872,
310
+ -0.005453015677630901,
311
+ -0.04112039878964424
312
+ ],
313
+ "q01": [
314
+ -0.1855636807291125,
315
+ -0.16145669766439186,
316
+ 0.7064185725262808,
317
+ 2.5678211534702324,
318
+ -1.2430377303522737,
319
+ -0.5195810482339626,
320
+ 0.01022917473133343,
321
+ -0.03999379658232052
322
+ ],
323
+ "q99": [
324
+ 0.05938728483051665,
325
+ 0.2361478409238694,
326
+ 0.9397258571145816,
327
+ 3.2118708728143526,
328
+ 0.49082919816100534,
329
+ 0.2100883989120329,
330
+ 0.040047131839991014,
331
+ -0.011104049991952391
332
+ ],
333
+ "std": [
334
+ 0.03756502182067285,
335
+ 0.05091765880150317,
336
+ 0.09107525593038836,
337
+ 0.12327524826514363,
338
+ 0.4418352294043351,
339
+ 0.12490994022681218,
340
+ 0.004662133639412193,
341
+ 0.00460807817987938
342
+ ]
343
+ },
344
+ "timestamp": {
345
+ "max": [
346
+ 25.2
347
+ ],
348
+ "mean": [
349
+ 7.007510548523206
350
+ ],
351
+ "min": [
352
+ 0.0
353
+ ],
354
+ "q01": null,
355
+ "q99": null,
356
+ "std": [
357
+ 4.457129586378845
358
+ ]
359
+ }
360
+ }
361
+ },
362
+ "datasets": {
363
+ "action_key": "action",
364
+ "action_window_size": 10,
365
+ "data_root_path": [
366
+ "datasets/libero_10_lerobot",
367
+ "datasets/libero_goal_no_noops_lerobotv2.1",
368
+ "datasets/libero_spatial_lerobot",
369
+ "datasets/libero_object_lerobot"
370
+ ],
371
+ "statistic_name": "libero_10_no_noops",
372
+ "transforms": [
373
+ {
374
+ "embodiment_id": 2,
375
+ "name_mappings": {
376
+ "actions": [
377
+ "actions"
378
+ ],
379
+ "observation.state": [
380
+ "states"
381
+ ]
382
+ },
383
+ "parquet_keys": [
384
+ "observation.state",
385
+ "timestamp",
386
+ "actions",
387
+ "info",
388
+ "stats",
389
+ "action_masks"
390
+ ],
391
+ "type": "ProcessParquetInputs",
392
+ "video_keys": [
393
+ "observation.images.image",
394
+ "observation.images.wrist_image"
395
+ ]
396
+ },
397
+ {
398
+ "type": "ParquetPrompter"
399
+ },
400
+ {
401
+ "tokenizer": {
402
+ "model_path": "./checkpoints/LimX-Qwen3-VL-0.6B-stage1-20260223/",
403
+ "type": "PretrainedTokenizer"
404
+ },
405
+ "type": "ProcessPrompts"
406
+ },
407
+ {
408
+ "height": 224,
409
+ "type": "ResizeImages",
410
+ "width": 224
411
+ },
412
+ {
413
+ "image_mean": [
414
+ 0.48145466,
415
+ 0.4578275,
416
+ 0.40821073
417
+ ],
418
+ "image_std": [
419
+ 0.26862954,
420
+ 0.26130258,
421
+ 0.27577711
422
+ ],
423
+ "max_pixels": 1003520,
424
+ "merge_size": 2,
425
+ "min_pixels": 3136,
426
+ "patch_size": 16,
427
+ "temporal_patch_size": 2,
428
+ "type": "QWen2VLImageTransform"
429
+ },
430
+ {
431
+ "action_dim": 32,
432
+ "action_key": "action",
433
+ "norm_type": "mean_std",
434
+ "state_dim": 64,
435
+ "state_key": "proprio",
436
+ "type": "NormalizeStatesAndActions"
437
+ }
438
+ ],
439
+ "type": "ParquetDataset",
440
+ "use_delta": false,
441
+ "window_start_idx": 0
442
+ },
443
+ "name_mappings": {
444
+ "action": [
445
+ "action"
446
+ ],
447
+ "observation.state": [
448
+ "proprio"
449
+ ]
450
+ },
451
+ "statistic_keys": [
452
+ "observation.state",
453
+ "timestamp",
454
+ "action"
455
+ ],
456
+ "statistic_name": "libero_10_no_noops",
457
+ "type": "DistributedRepeatingDataset"
458
+ },
459
+ "per_device_batch_size": 8,
460
+ "per_device_num_workers": 4
461
+ }
462
+ }
gr00t_qwen3vl_0.6b_libero/config.yaml ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ eval:
2
+ dataset:
3
+ transforms:
4
+ - embodiment_id: 2
5
+ img_keys:
6
+ - agentview_image
7
+ - robot0_eye_in_hand_image
8
+ type: ProcessLiberoEvalInputs
9
+ - type: ConvertPILImageToNumpyArray
10
+ - image_mean:
11
+ - 0.48145466
12
+ - 0.4578275
13
+ - 0.40821073
14
+ image_std:
15
+ - 0.26862954
16
+ - 0.26130258
17
+ - 0.27577711
18
+ img_key: pixel_values
19
+ max_pixels: 1003520
20
+ merge_size: 2
21
+ min_pixels: 3136
22
+ patch_size: 16
23
+ temporal_patch_size: 2
24
+ to_tensor: true
25
+ type: QWen2VLImageTransform
26
+ - tokenizer:
27
+ model_path: ./checkpoints/LimX-Qwen3-VL-0.6B-stage1-20260223/
28
+ type: PretrainedTokenizer
29
+ type: LiberoPromptFromInputs
30
+ - gripper_key: robot0_gripper_qpos
31
+ norm_type: mean_std
32
+ out_key: states
33
+ pos_key: robot0_eef_pos
34
+ quat_key: robot0_eef_quat
35
+ state_dim: 64
36
+ type: LiberoProprioFromInputs
37
+ type: LiberoParquetEvalDataset
38
+ denormalize_action:
39
+ norm_type: mean_std
40
+ type: DenormalizeLiberoAction
41
+ eval_chunk_size: 10
42
+ model_family: pi0
43
+ num_steps_wait: 10
44
+ num_trials_per_task: 50
45
+ resize_size: 224
46
+ seed: 7
47
+ task_suite_name: libero_10
48
+ type: LiberoEvalRunner
49
+ inference_model:
50
+ freeze_projector: false
51
+ freeze_vlm_backbone: false
52
+ name_mapping: null
53
+ pretrained_name_or_path: null
54
+ type: LlavaVLA
55
+ vla_head:
56
+ action_dim: 32
57
+ backbone_embedding_dim: 2048
58
+ diffusion_model_cfg:
59
+ attention_head_dim: 48
60
+ cross_attention_dim: 2048
61
+ dropout: 0.2
62
+ final_dropout: true
63
+ interleave_self_attention: true
64
+ norm_type: ada_norm
65
+ num_attention_heads: 32
66
+ num_layers: 16
67
+ output_dim: 1024
68
+ positional_embeddings: null
69
+ hidden_size: 1024
70
+ input_embedding_dim: 1536
71
+ num_heads: 4
72
+ num_inference_timesteps: 4
73
+ num_layers: 1
74
+ ori_action_dim: 7
75
+ state_dim: 64
76
+ traj_length: 10
77
+ type: FlowMatchingHead
78
+ vl_self_attention_cfg:
79
+ attention_head_dim: 64
80
+ dropout: 0.2
81
+ final_dropout: true
82
+ num_attention_heads: 32
83
+ num_layers: 4
84
+ positional_embeddings: null
85
+ vlm_backbone:
86
+ attn_implementation: sdpa
87
+ projection_output_dim: 2048
88
+ projection_type: linear
89
+ type: Qwen3VL
90
+ use_projection: true
91
+ vlm_backbone_id: qwen3_0.6b_vl_pt
92
+ vlm_config: null
93
+ vlm_path: ./checkpoints/LimX-Qwen3-VL-0.6B-stage1-20260223/
94
+ model:
95
+ freeze_projector: false
96
+ freeze_vlm_backbone: false
97
+ name_mapping:
98
+ vla_head: action_head
99
+ pretrained_name_or_path: ./checkpoints/GR00T-N1.5-3B
100
+ strict_mapping: false
101
+ type: LlavaVLA
102
+ vla_head:
103
+ action_dim: 32
104
+ backbone_embedding_dim: 2048
105
+ diffusion_model_cfg:
106
+ attention_head_dim: 48
107
+ cross_attention_dim: 2048
108
+ dropout: 0.2
109
+ final_dropout: true
110
+ interleave_self_attention: true
111
+ norm_type: ada_norm
112
+ num_attention_heads: 32
113
+ num_layers: 16
114
+ output_dim: 1024
115
+ positional_embeddings: null
116
+ hidden_size: 1024
117
+ input_embedding_dim: 1536
118
+ num_heads: 4
119
+ num_inference_timesteps: 4
120
+ num_layers: 1
121
+ ori_action_dim: 7
122
+ state_dim: 64
123
+ traj_length: 10
124
+ type: FlowMatchingHead
125
+ vl_self_attention_cfg:
126
+ attention_head_dim: 64
127
+ dropout: 0.2
128
+ final_dropout: true
129
+ num_attention_heads: 32
130
+ num_layers: 4
131
+ positional_embeddings: null
132
+ vlm_backbone:
133
+ attn_implementation: sdpa
134
+ projection_output_dim: 2048
135
+ projection_type: linear
136
+ type: Qwen3VL
137
+ use_projection: true
138
+ vlm_backbone_id: qwen3_0.6b_vl_pt
139
+ vlm_config: null
140
+ vlm_path: ./checkpoints/LimX-Qwen3-VL-0.6B-stage1-20260223/
141
+ runner:
142
+ change_key_name: false
143
+ collator:
144
+ keys:
145
+ - states
146
+ - observation.eepose
147
+ - timestamp
148
+ - images
149
+ - img_masks
150
+ - lang_tokens
151
+ - lang_masks
152
+ - actions
153
+ - action_masks
154
+ - embodiment_ids
155
+ - image_grid_thw
156
+ meta_keys:
157
+ - task_description
158
+ - prompt
159
+ - info
160
+ - stats
161
+ type: DictCollator
162
+ enable_gradient_checkpointing: false
163
+ enable_mixed_precision_training: true
164
+ learning_rate: 1.5e-05
165
+ lr_scheduler_type: linear-warmup+cosine-decay
166
+ max_epochs: 24
167
+ max_grad_norm: 1.0
168
+ metric:
169
+ active_trackers:
170
+ - jsonl
171
+ - wandb
172
+ grad_accumulation_steps: 1
173
+ run_dir: work_dirs
174
+ type: VLAMetric
175
+ window_size: 1
176
+ mixed_precision_dtype: bf16
177
+ sampler: null
178
+ sharding_strategy: full-shard
179
+ tokenizer:
180
+ model_path: ./checkpoints/LimX-Qwen3-VL-0.6B-stage1-20260223/
181
+ type: PretrainedTokenizer
182
+ type: FSDPTrainRunner
183
+ warmup_ratio: 0.03
184
+ weight_decay: 0.0
185
+ train_dataloader:
186
+ dataset:
187
+ dataset_statistics:
188
+ libero_10_no_noops:
189
+ action:
190
+ max:
191
+ - 0.9375
192
+ - 0.9375
193
+ - 0.9375
194
+ - 0.32892856001853943
195
+ - 0.36964285373687744
196
+ - 0.375
197
+ - 1.0
198
+ mean:
199
+ - 0.01905656634877842
200
+ - 0.05672475971568838
201
+ - -0.056239289430234256
202
+ - 0.004756678478841528
203
+ - 0.002797492338491304
204
+ - -0.00714607048416358
205
+ - 0.54599156235075
206
+ min:
207
+ - -0.9375
208
+ - -0.9375
209
+ - -0.9375
210
+ - -0.23642857372760773
211
+ - -0.3053571283817291
212
+ - -0.3642857074737549
213
+ - 0.0
214
+ q01:
215
+ - -0.4997477764535965
216
+ - -0.6992653512084763
217
+ - -0.6543309163615124
218
+ - -0.07417070079989778
219
+ - -0.11898748445770971
220
+ - -0.15976085962510805
221
+ - 0.0
222
+ q99:
223
+ - 0.658747846713789
224
+ - 0.7333480638990948
225
+ - 0.768601965587579
226
+ - 0.09784501244893279
227
+ - 0.12943469061349036
228
+ - 0.15137893471596325
229
+ - 1.0
230
+ std:
231
+ - 0.10588348353857541
232
+ - 0.13552477199270377
233
+ - 0.13886650724555177
234
+ - 0.01433739270759898
235
+ - 0.02038583948325967
236
+ - 0.033299202425577934
237
+ - 0.1881810653484855
238
+ proprio:
239
+ max:
240
+ - 0.2103137969970703
241
+ - 0.38887521624565125
242
+ - 1.333192229270935
243
+ - 3.7248642444610596
244
+ - 3.5618896484375
245
+ - 1.3863215446472168
246
+ - 0.041575800627470016
247
+ - 0.0013126095291227102
248
+ mean:
249
+ - -0.0419132679050224
250
+ - 0.034591788297521735
251
+ - 0.8265881844959498
252
+ - 2.90259518190321
253
+ - -0.5570652600832564
254
+ - -0.16592166873533284
255
+ - 0.02845031351083622
256
+ - -0.02880236273799356
257
+ min:
258
+ - -0.48278069496154785
259
+ - -0.3309336006641388
260
+ - 0.44550687074661255
261
+ - 1.1323540210723877
262
+ - -3.6312508583068848
263
+ - -1.842738389968872
264
+ - -0.005453015677630901
265
+ - -0.04112039878964424
266
+ q01:
267
+ - -0.1855636807291125
268
+ - -0.16145669766439186
269
+ - 0.7064185725262808
270
+ - 2.5678211534702324
271
+ - -1.2430377303522737
272
+ - -0.5195810482339626
273
+ - 0.01022917473133343
274
+ - -0.03999379658232052
275
+ q99:
276
+ - 0.05938728483051665
277
+ - 0.2361478409238694
278
+ - 0.9397258571145816
279
+ - 3.2118708728143526
280
+ - 0.49082919816100534
281
+ - 0.2100883989120329
282
+ - 0.040047131839991014
283
+ - -0.011104049991952391
284
+ std:
285
+ - 0.03756502182067285
286
+ - 0.05091765880150317
287
+ - 0.09107525593038836
288
+ - 0.12327524826514363
289
+ - 0.4418352294043351
290
+ - 0.12490994022681218
291
+ - 0.004662133639412193
292
+ - 0.00460807817987938
293
+ timestamp:
294
+ max:
295
+ - 25.2
296
+ mean:
297
+ - 7.007510548523206
298
+ min:
299
+ - 0.0
300
+ q01: null
301
+ q99: null
302
+ std:
303
+ - 4.457129586378845
304
+ datasets:
305
+ action_key: action
306
+ action_window_size: 10
307
+ data_root_path:
308
+ - datasets/libero_10_lerobot
309
+ - datasets/libero_goal_no_noops_lerobotv2.1
310
+ - datasets/libero_spatial_lerobot
311
+ - datasets/libero_object_lerobot
312
+ statistic_name: libero_10_no_noops
313
+ transforms:
314
+ - embodiment_id: 2
315
+ name_mappings:
316
+ actions:
317
+ - actions
318
+ observation.state:
319
+ - states
320
+ parquet_keys:
321
+ - observation.state
322
+ - timestamp
323
+ - actions
324
+ - info
325
+ - stats
326
+ - action_masks
327
+ type: ProcessParquetInputs
328
+ video_keys:
329
+ - observation.images.image
330
+ - observation.images.wrist_image
331
+ - type: ParquetPrompter
332
+ - tokenizer:
333
+ model_path: ./checkpoints/LimX-Qwen3-VL-0.6B-stage1-20260223/
334
+ type: PretrainedTokenizer
335
+ type: ProcessPrompts
336
+ - height: 224
337
+ type: ResizeImages
338
+ width: 224
339
+ - image_mean:
340
+ - 0.48145466
341
+ - 0.4578275
342
+ - 0.40821073
343
+ image_std:
344
+ - 0.26862954
345
+ - 0.26130258
346
+ - 0.27577711
347
+ max_pixels: 1003520
348
+ merge_size: 2
349
+ min_pixels: 3136
350
+ patch_size: 16
351
+ temporal_patch_size: 2
352
+ type: QWen2VLImageTransform
353
+ - action_dim: 32
354
+ action_key: action
355
+ norm_type: mean_std
356
+ state_dim: 64
357
+ state_key: proprio
358
+ type: NormalizeStatesAndActions
359
+ type: ParquetDataset
360
+ use_delta: false
361
+ window_start_idx: 0
362
+ name_mappings:
363
+ action:
364
+ - action
365
+ observation.state:
366
+ - proprio
367
+ statistic_keys:
368
+ - observation.state
369
+ - timestamp
370
+ - action
371
+ statistic_name: libero_10_no_noops
372
+ type: DistributedRepeatingDataset
373
+ per_device_batch_size: 8
374
+ per_device_num_workers: 4
gr00t_qwen3vl_0.6b_libero/dataset_statistics.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "libero_10_no_noops": {
3
+ "proprio": {
4
+ "mean": [
5
+ -0.0419132679050224,
6
+ 0.034591788297521735,
7
+ 0.8265881844959498,
8
+ 2.90259518190321,
9
+ -0.5570652600832564,
10
+ -0.16592166873533284,
11
+ 0.02845031351083622,
12
+ -0.02880236273799356
13
+ ],
14
+ "std": [
15
+ 0.03756502182067285,
16
+ 0.05091765880150317,
17
+ 0.09107525593038836,
18
+ 0.12327524826514363,
19
+ 0.4418352294043351,
20
+ 0.12490994022681218,
21
+ 0.004662133639412193,
22
+ 0.00460807817987938
23
+ ],
24
+ "min": [
25
+ -0.48278069496154785,
26
+ -0.3309336006641388,
27
+ 0.44550687074661255,
28
+ 1.1323540210723877,
29
+ -3.6312508583068848,
30
+ -1.842738389968872,
31
+ -0.005453015677630901,
32
+ -0.04112039878964424
33
+ ],
34
+ "max": [
35
+ 0.2103137969970703,
36
+ 0.38887521624565125,
37
+ 1.333192229270935,
38
+ 3.7248642444610596,
39
+ 3.5618896484375,
40
+ 1.3863215446472168,
41
+ 0.041575800627470016,
42
+ 0.0013126095291227102
43
+ ],
44
+ "q01": [
45
+ -0.1855636807291125,
46
+ -0.16145669766439186,
47
+ 0.7064185725262808,
48
+ 2.5678211534702324,
49
+ -1.2430377303522737,
50
+ -0.5195810482339626,
51
+ 0.01022917473133343,
52
+ -0.03999379658232052
53
+ ],
54
+ "q99": [
55
+ 0.05938728483051665,
56
+ 0.2361478409238694,
57
+ 0.9397258571145816,
58
+ 3.2118708728143526,
59
+ 0.49082919816100534,
60
+ 0.2100883989120329,
61
+ 0.040047131839991014,
62
+ -0.011104049991952391
63
+ ]
64
+ },
65
+ "timestamp": {
66
+ "mean": [
67
+ 7.007510548523206
68
+ ],
69
+ "std": [
70
+ 4.457129586378845
71
+ ],
72
+ "min": [
73
+ 0.0
74
+ ],
75
+ "max": [
76
+ 25.2
77
+ ],
78
+ "q01": null,
79
+ "q99": null
80
+ },
81
+ "action": {
82
+ "mean": [
83
+ 0.01905656634877842,
84
+ 0.05672475971568838,
85
+ -0.056239289430234256,
86
+ 0.004756678478841528,
87
+ 0.002797492338491304,
88
+ -0.00714607048416358,
89
+ 0.54599156235075
90
+ ],
91
+ "std": [
92
+ 0.10588348353857541,
93
+ 0.13552477199270377,
94
+ 0.13886650724555177,
95
+ 0.01433739270759898,
96
+ 0.02038583948325967,
97
+ 0.033299202425577934,
98
+ 0.1881810653484855
99
+ ],
100
+ "min": [
101
+ -0.9375,
102
+ -0.9375,
103
+ -0.9375,
104
+ -0.23642857372760773,
105
+ -0.3053571283817291,
106
+ -0.3642857074737549,
107
+ 0.0
108
+ ],
109
+ "max": [
110
+ 0.9375,
111
+ 0.9375,
112
+ 0.9375,
113
+ 0.32892856001853943,
114
+ 0.36964285373687744,
115
+ 0.375,
116
+ 1.0
117
+ ],
118
+ "q01": [
119
+ -0.4997477764535965,
120
+ -0.6992653512084763,
121
+ -0.6543309163615124,
122
+ -0.07417070079989778,
123
+ -0.11898748445770971,
124
+ -0.15976085962510805,
125
+ 0.0
126
+ ],
127
+ "q99": [
128
+ 0.658747846713789,
129
+ 0.7333480638990948,
130
+ 0.768601965587579,
131
+ 0.09784501244893279,
132
+ 0.12943469061349036,
133
+ 0.15137893471596325,
134
+ 1.0
135
+ ]
136
+ }
137
+ }
138
+ }
gr00t_qwen3vl_0.6b_libero/tokenizer/added_tokens.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<action_end>": 151675,
6
+ "<action_pad>": 151673,
7
+ "<action_start>": 151674,
8
+ "<action_video>": 151669,
9
+ "<box2d_end>": 151679,
10
+ "<box2d_start>": 151678,
11
+ "<future_video_pad>": 151670,
12
+ "<future_vision_end>": 151672,
13
+ "<future_vision_start>": 151671,
14
+ "<ignore_pad>": 151688,
15
+ "<point2d_end>": 151681,
16
+ "<point2d_start>": 151680,
17
+ "<ref_end>": 151683,
18
+ "<ref_keypoint_end>": 151685,
19
+ "<ref_keypoint_start>": 151684,
20
+ "<ref_start>": 151682,
21
+ "<think>": 151667,
22
+ "<think_end>": 151677,
23
+ "<think_start>": 151676,
24
+ "<tool_call>": 151657,
25
+ "<tool_response>": 151665,
26
+ "<traj2d_end>": 151687,
27
+ "<traj2d_start>": 151686,
28
+ "<|box_end|>": 151649,
29
+ "<|box_start|>": 151648,
30
+ "<|endoftext|>": 151643,
31
+ "<|file_sep|>": 151664,
32
+ "<|fim_middle|>": 151660,
33
+ "<|fim_pad|>": 151662,
34
+ "<|fim_prefix|>": 151659,
35
+ "<|fim_suffix|>": 151661,
36
+ "<|im_end|>": 151645,
37
+ "<|im_start|>": 151644,
38
+ "<|image_pad|>": 151655,
39
+ "<|object_ref_end|>": 151647,
40
+ "<|object_ref_start|>": 151646,
41
+ "<|quad_end|>": 151651,
42
+ "<|quad_start|>": 151650,
43
+ "<|repo_name|>": 151663,
44
+ "<|video_pad|>": 151656,
45
+ "<|vision_end|>": 151653,
46
+ "<|vision_pad|>": 151654,
47
+ "<|vision_start|>": 151652
48
+ }
gr00t_qwen3vl_0.6b_libero/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_qwen3vl_0.6b_libero/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
gr00t_qwen3vl_0.6b_libero/tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b7a88bc82340dd4205b97b9c287df826cf386b31ac9ecd9e648073d940355e1
3
+ size 11426476
gr00t_qwen3vl_0.6b_libero/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<action_video>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<future_video_pad>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<future_vision_start>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<future_vision_end>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<action_pad>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<action_start>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<action_end>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "151676": {
270
+ "content": "<think_start>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "151677": {
278
+ "content": "<think_end>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "151678": {
286
+ "content": "<box2d_start>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "151679": {
294
+ "content": "<box2d_end>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "151680": {
302
+ "content": "<point2d_start>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "151681": {
310
+ "content": "<point2d_end>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "151682": {
318
+ "content": "<ref_start>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "151683": {
326
+ "content": "<ref_end>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "151684": {
334
+ "content": "<ref_keypoint_start>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "151685": {
342
+ "content": "<ref_keypoint_end>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "151686": {
350
+ "content": "<traj2d_start>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "151687": {
358
+ "content": "<traj2d_end>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "151688": {
366
+ "content": "<ignore_pad>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ }
373
+ },
374
+ "additional_special_tokens": [
375
+ "<|im_start|>",
376
+ "<|im_end|>",
377
+ "<|object_ref_start|>",
378
+ "<|object_ref_end|>",
379
+ "<|box_start|>",
380
+ "<|box_end|>",
381
+ "<|quad_start|>",
382
+ "<|quad_end|>",
383
+ "<|vision_start|>",
384
+ "<|vision_end|>",
385
+ "<|vision_pad|>",
386
+ "<|image_pad|>",
387
+ "<|video_pad|>"
388
+ ],
389
+ "bos_token": null,
390
+ "clean_up_tokenization_spaces": false,
391
+ "eos_token": "<|im_end|>",
392
+ "errors": "replace",
393
+ "extra_special_tokens": {},
394
+ "model_max_length": 32768,
395
+ "pad_token": "<|endoftext|>",
396
+ "padding_side": "right",
397
+ "processor_class": "Qwen3VLAProcessor",
398
+ "split_special_tokens": false,
399
+ "tokenizer_class": "Qwen2Tokenizer",
400
+ "unk_token": null
401
+ }
gr00t_qwen3vl_0.6b_libero/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_qwen3vl_0.6b_libero/vlm_backbone_config.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_pred_2d_waypoint_loss_weight": 1.0,
3
+ "action_pred_2d_waypoint_max_dim": 4,
4
+ "action_pred_action_head_max_num_embodiments": 64,
5
+ "action_pred_arm_dim": 12,
6
+ "action_pred_binary_flags_loss_weight": 1.0,
7
+ "action_pred_binary_flags_max_dim": 2,
8
+ "action_pred_diffusion_loss_weight": 1.0,
9
+ "action_pred_diffusion_transformer_gradient_weight": 1.0,
10
+ "action_pred_enable": false,
11
+ "action_pred_extra_camera_num": 2,
12
+ "action_pred_gripper_dim": 2,
13
+ "action_pred_gripper_loss_weight": 15.0,
14
+ "action_pred_hidden_size": 256,
15
+ "action_pred_loss_weight": 10.0,
16
+ "action_pred_max_cot_input_token": 50,
17
+ "action_pred_max_input_arm_state_dim": 32,
18
+ "action_pred_max_input_gripper_state_dim": 32,
19
+ "action_pred_model_type": "GRTransformer",
20
+ "action_pred_num_attention_heads": 8,
21
+ "action_pred_num_hidden_layers": 8,
22
+ "action_pred_objective": "continuous",
23
+ "action_pred_text_cot_gradient_weight": 1.0,
24
+ "action_pred_traj_length": 10,
25
+ "action_pred_tune_action_head_backbone": true,
26
+ "action_pred_tune_action_head_projector": true,
27
+ "action_pred_vision_cot_gradient_weight": 1.0,
28
+ "architectures": [
29
+ "Qwen3VLAForConditionalGeneration"
30
+ ],
31
+ "depth_pred_ds_ratio": 8,
32
+ "depth_pred_enable": false,
33
+ "depth_pred_head_type": "Unet",
34
+ "depth_pred_loss_type": "relative",
35
+ "depth_pred_loss_weight": 1.0,
36
+ "dtype": "bfloat16",
37
+ "eos_token_id": 151645,
38
+ "finetune_ignore_mismatched_sizes": false,
39
+ "image_token_id": 151655,
40
+ "mm_action_decoder_type": null,
41
+ "mm_spatial_pool_mode": "bilinear",
42
+ "mm_tunable_parts": "mm_vision_adapter,mm_language_model",
43
+ "model_type": "qwen3_vl",
44
+ "pad_token_id": 151643,
45
+ "pos_skipping_range": 4096,
46
+ "text_config": {
47
+ "attention_bias": false,
48
+ "attention_dropout": 0.0,
49
+ "bos_token_id": 151643,
50
+ "dtype": "bfloat16",
51
+ "eos_token_id": 151645,
52
+ "head_dim": 128,
53
+ "hidden_act": "silu",
54
+ "hidden_size": 1024,
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "max_position_embeddings": 262144,
58
+ "model_type": "qwen3_vl_text",
59
+ "num_attention_heads": 16,
60
+ "num_hidden_layers": 28,
61
+ "num_key_value_heads": 8,
62
+ "pad_token_id": null,
63
+ "rms_norm_eps": 1e-06,
64
+ "rope_parameters": {
65
+ "mrope_interleaved": true,
66
+ "mrope_section": [
67
+ 24,
68
+ 20,
69
+ 20
70
+ ],
71
+ "rope_theta": 5000000,
72
+ "rope_type": "default"
73
+ },
74
+ "tie_word_embeddings": true,
75
+ "use_cache": true,
76
+ "vocab_size": 151936
77
+ },
78
+ "tie_word_embeddings": true,
79
+ "transformers_version": "5.3.0.dev0",
80
+ "use_another_LLM_path": "",
81
+ "use_pos_skipping": false,
82
+ "video_pred_enable": false,
83
+ "video_pred_head_type": "mtp",
84
+ "video_pred_loss_weight": 0.5,
85
+ "video_pred_max_frame_length": 10,
86
+ "video_pred_max_mtp_depth": 10,
87
+ "video_pred_mtp_depth_num": 3,
88
+ "video_token_id": 151656,
89
+ "vision_config": {
90
+ "deepstack_visual_indexes": [
91
+ 5,
92
+ 11,
93
+ 17
94
+ ],
95
+ "depth": 24,
96
+ "dtype": "bfloat16",
97
+ "hidden_act": "gelu_pytorch_tanh",
98
+ "hidden_size": 1024,
99
+ "in_channels": 3,
100
+ "initializer_range": 0.02,
101
+ "intermediate_size": 4096,
102
+ "model_type": "qwen3_vl",
103
+ "num_heads": 16,
104
+ "num_position_embeddings": 2304,
105
+ "out_hidden_size": 1024,
106
+ "patch_size": 16,
107
+ "spatial_merge_size": 2,
108
+ "temporal_patch_size": 2
109
+ }
110
+ }