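# starVLA training configuration: QwenGR00T framework with frame-skip enabled,
# trained on GR00T X-Embodiment simulation data (RoboCasa checkpoint directory).

# Dataset: fourier_gr1_unified_1000 mix, 224x224 frames, PyAV video backend,
# per-device batch size 16.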
datasets:
  vla_data:
    CoT_prompt: '{instruction}'
    data_mix: fourier_gr1_unified_1000
    data_root_dir: playground/Datasets/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim
    delete_pause_frame: false
    image_size:
    - 224
    - 224
    per_device_batch_size: 16
    video_backend: pyav
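# Frame-skip: frames are scored by a gripper-aware importance model backed by a
# DINOv2 ViT-L/14 encoder (alpha/beta/gamma and vac_beta presumably weight the
# individual importance cues), then pruned down to one of the compression
# ratios listed under `pruning`.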
frameskip:
  cache_dir: playground/frameskip/frameskip_cache_vac_cr20-100
  default_compression_ratio: 1.0
  enabled: true
  importance:
    allow_backend_fallback: false
    alpha: 0.6
    beta: 0.2
    device: cuda
    enable_vac: true
    gamma: 0.2
    max_vac_frames: 16
    type: gripper_aware
    vac_beta: 0.2
    video_backend: ffmpeg
    visual_encoder_checkpoint: /mnt/project_ai4edu/share/models/timm/vit_large_patch14_dinov2.lvd142m/pytorch_model.bin
    visual_encoder_name: vit_large_patch14_dinov2
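  # Supported compression ratios; used_compression_ratios presumably defines the
  # pool sampled from during training (here heavily biased toward 0.2).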
  pruning:
    compression_ratios:
    - 0.2
    - 0.3
    - 0.4
    - 0.5
    - 0.6
    - 0.7
    - 0.8
    - 0.9
    - 1.0
    used_compression_ratios:
    - 0.2
    - 0.2
    - 0.2
    - 0.2
    - 0.2
    - 1.0
  training:
    dynamic_ratio: true
    ratio_schedule: uniform
    warmup_steps: 5000
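# Model: Qwen3-VL-4B-Instruct vision-language backbone paired with a DiT-B
# diffusion action head (29-dim actions, 16-step action horizon, 58-dim state,
# 4 denoising steps at inference).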
framework:
  action_model:
    action_dim: 29
    action_horizon: 16
    action_model_type: DiT-B
    add_pos_embed: true
    diffusion_model_cfg:
      cross_attention_dim: 2560
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 2560
      positional_embeddings: null
    future_action_window_size: 15
    hidden_size: 2560
    max_seq_len: 1024
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    num_timestep_buckets: 1000
    past_action_window_size: 0
    state_dim: 58
  name: QwenGR00T
  qwenvl:
    base_vlm: Qwen/Qwen3-VL-4B-Instruct
    template: qwen3_vl
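# Output root for checkpoints and the global random seed.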
run_root_dir: ./results/Checkpoints/FrameSkip/RoboCasa
seed: 42
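# Optimization: per-module learning rates (action model 1e-4, Qwen-VL interface
# 1e-5), Adam-style betas/eps/weight_decay, cosine-with-min-lr schedule
# (floor 5e-7), 5k warmup steps, 100k total steps, checkpoints every 10k steps.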
trainer:
  eval_interval: 1000
  freeze_modules: true
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  is_resume: true
  learning_rate:
    action_model: 0.0001
    base: 1.0e-05
    qwen_vl_interface: 1.0e-05
  logging_frequency: 100
  lr_scheduler_type: cosine_with_min_lr
  max_train_steps: 100000
  num_warmup_steps: 5000
  optimizer:
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  save_interval: 10000
  scheduler_specific_kwargs:
    min_lr: 5.0e-07
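# Weights & Biases logging target.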
wandb_entity: jinhuiye
wandb_project: starVLA