Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

libero_10_2B/starvla_qwen_dual/config.yaml +111 -0
libero_goal_2B/starvla_qwen_dual/config.yaml +111 -0
libero_object_2B/starvla_qwen_dual/config.yaml +111 -0
libero_spatial_2B/starvla_qwen_dual/config.yaml +111 -0

libero_10_2B/starvla_qwen_dual/config.yaml ADDED Viewed

	@@ -0,0 +1,111 @@

+run_id: starvla_qwen_dual  # run name; also the last path component of output_dir at the end of this config
+run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_10_2B  # output_dir is this path + "/" + run_id
+seed: 42
+trackers:  # two trackers are listed: jsonl and wandb
+- jsonl
+- wandb
+wandb_project: starVLA
+is_debug: false
+enable_mee: false
+mee_weight: 0.01  # NOTE(review): presumably has no effect while enable_mee is false — confirm in the trainer
+framework:  # model settings: VLM (qwenvl), DINO backbone, and action model
+  name: Qwen-Dual
+  qwenvl:
+    base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct  # local path; directory name says Qwen3-VL 2B Instruct
+    attn_implementation: flash_attention_2
+    vl_hidden_dim: 2048  # same value as action_model.diffusion_model_cfg.cross_attention_dim
+  dino:
+    dino_backbone: dinov2_vits14
+  action_model:
+    action_model_type: DiT-B
+    action_hidden_dim: 2  # NOTE(review): very small next to hidden_size 1024 — confirm this value is intended
+    hidden_size: 1024  # same value as diffusion_model_cfg.output_dim
+    add_pos_embed: true
+    max_seq_len: 1024
+    action_dim: 7  # same value as state_dim
+    state_dim: 7
+    future_action_window_size: 7
+    action_horizon: 8  # equals future_action_window_size + 1; presumably current step plus the future window — TODO confirm
+    past_action_window_size: 0
+    repeated_diffusion_steps: 8
+    noise_beta_alpha: 1.5  # presumably paired with noise_beta_beta as Beta-distribution parameters — verify
+    noise_beta_beta: 1.0
+    noise_s: 0.999
+    num_timestep_buckets: 1000
+    num_inference_timesteps: 4
+    num_target_vision_tokens: 32
+    diffusion_model_cfg:
+      cross_attention_dim: 2048  # same value as qwenvl.vl_hidden_dim
+      dropout: 0.2
+      final_dropout: true
+      interleave_self_attention: true
+      norm_type: ada_norm
+      num_layers: 16
+      output_dim: 1024  # same value as action_model.hidden_size
+      positional_embeddings: null
+  reduce_in_full_precision: true
+datasets:  # two dataset groups: vlm_data and vla_data
+  vlm_data:
+    dataset_py: vlm_datasets
+    dataformat: llava_json
+    dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
+    eval_dataset: aokvqa_cauldron_llava_format
+    data_flatten: false
+    base_interval: 2
+    max_pixels: 12845056  # = 16384 * 28 * 28
+    min_pixels: 3136  # = 4 * 28 * 28
+    model_max_length: 2048
+    model_type: qwen2.5vl  # NOTE(review): framework.qwenvl.base_vlm path names Qwen3-VL, not Qwen2.5-VL — confirm
+    per_device_batch_size: 4  # vla_data uses 16
+  vla_data:
+    dataset_py: lerobot_datasets
+    data_root_dir: /share/project/baishuanghao/data/libero_lerobot
+    data_mix: libero_10  # same suite name as the libero_10_2B component of run_root_dir
+    action_type: delta_qpos
+    CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
+      Locate their bounding boxes in [x1,y1,x2,y2] format.
+    CoT_answer: bbox  # CoT_prompt above is a two-line plain scalar; YAML folds its line break into one space
+    default_image_resolution:  # three entries: 3, 224, 224 — presumably (channels, height, width); TODO confirm
+    - 3
+    - 224
+    - 224
+    per_device_batch_size: 16
+    load_all_data_for_training: true
+    obs:  # single entry: image_0
+    - image_0
+trainer:  # optimisation schedule, learning rates, loss weights, and resume flags
+  epochs: 100
+  max_train_steps: 30000
+  num_warmup_steps: 5000  # NOTE(review): warmup_ratio below is 0.1 (3000 of 30000 steps); the two disagree — confirm which is used
+  save_interval: 60000  # NOTE(review): larger than max_train_steps (30000); confirm a checkpoint is still written
+  eval_interval: 30000  # equal to max_train_steps
+  learning_rate:
+    base: 4.0e-05
+    qwen_vl_interface: 1.0e-05
+    action_model: 0.0001  # ten times the qwen_vl_interface rate
+  lr_scheduler_type: cosine_with_min_lr
+  scheduler_specific_kwargs:
+    min_lr: 1.0e-06
+  freeze_modules: null
+  loss_scale:
+    vla: 1.0
+    vlm: 0.1
+  max_grad_norm: 1.0  # NOTE(review): gradient_clipping below holds the same value; presumably one is redundant — verify
+  warmup_ratio: 0.1  # NOTE(review): see num_warmup_steps above (5000 vs 0.1 * 30000 = 3000)
+  weight_decay: 0.0  # NOTE(review): optimizer.weight_decay below is 1.0e-08 — confirm which one is applied
+  logging_frequency: 100
+  gradient_clipping: 1.0
+  gradient_accumulation_steps: 1
+  optimizer:
+    name: AdamW
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    weight_decay: 1.0e-08  # differs from trainer.weight_decay (0.0)
+  is_resume: false
+  resume_epoch: null
+  resume_step: null
+  enable_gradient_checkpointing: true
+  enable_mixed_precision_training: true
+output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_10_2B/starvla_qwen_dual  # run_root_dir + "/" + run_id

libero_goal_2B/starvla_qwen_dual/config.yaml ADDED Viewed

	@@ -0,0 +1,111 @@

+run_id: starvla_qwen_dual  # run name; also the last path component of output_dir at the end of this config
+run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_goal_2B  # output_dir is this path + "/" + run_id
+seed: 42
+trackers:  # two trackers are listed: jsonl and wandb
+- jsonl
+- wandb
+wandb_project: starVLA
+is_debug: false
+enable_mee: false
+mee_weight: 0.01  # NOTE(review): presumably has no effect while enable_mee is false — confirm in the trainer
+framework:  # model settings: VLM (qwenvl), DINO backbone, and action model
+  name: Qwen-Dual
+  qwenvl:
+    base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct  # local path; directory name says Qwen3-VL 2B Instruct
+    attn_implementation: flash_attention_2
+    vl_hidden_dim: 2048  # same value as action_model.diffusion_model_cfg.cross_attention_dim
+  dino:
+    dino_backbone: dinov2_vits14
+  action_model:
+    action_model_type: DiT-B
+    action_hidden_dim: 2  # NOTE(review): very small next to hidden_size 1024 — confirm this value is intended
+    hidden_size: 1024  # same value as diffusion_model_cfg.output_dim
+    add_pos_embed: true
+    max_seq_len: 1024
+    action_dim: 7  # same value as state_dim
+    state_dim: 7
+    future_action_window_size: 7
+    action_horizon: 8  # equals future_action_window_size + 1; presumably current step plus the future window — TODO confirm
+    past_action_window_size: 0
+    repeated_diffusion_steps: 8
+    noise_beta_alpha: 1.5  # presumably paired with noise_beta_beta as Beta-distribution parameters — verify
+    noise_beta_beta: 1.0
+    noise_s: 0.999
+    num_timestep_buckets: 1000
+    num_inference_timesteps: 4
+    num_target_vision_tokens: 32
+    diffusion_model_cfg:
+      cross_attention_dim: 2048  # same value as qwenvl.vl_hidden_dim
+      dropout: 0.2
+      final_dropout: true
+      interleave_self_attention: true
+      norm_type: ada_norm
+      num_layers: 16
+      output_dim: 1024  # same value as action_model.hidden_size
+      positional_embeddings: null
+  reduce_in_full_precision: true
+datasets:  # two dataset groups: vlm_data and vla_data
+  vlm_data:
+    dataset_py: vlm_datasets
+    dataformat: llava_json
+    dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
+    eval_dataset: aokvqa_cauldron_llava_format
+    data_flatten: false
+    base_interval: 2
+    max_pixels: 12845056  # = 16384 * 28 * 28
+    min_pixels: 3136  # = 4 * 28 * 28
+    model_max_length: 2048
+    model_type: qwen2.5vl  # NOTE(review): framework.qwenvl.base_vlm path names Qwen3-VL, not Qwen2.5-VL — confirm
+    per_device_batch_size: 4  # vla_data uses 16
+  vla_data:
+    dataset_py: lerobot_datasets
+    data_root_dir: /share/project/baishuanghao/data/libero_lerobot
+    data_mix: libero_goal  # same suite name as the libero_goal_2B component of run_root_dir
+    action_type: delta_qpos
+    CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
+      Locate their bounding boxes in [x1,y1,x2,y2] format.
+    CoT_answer: bbox  # CoT_prompt above is a two-line plain scalar; YAML folds its line break into one space
+    default_image_resolution:  # three entries: 3, 224, 224 — presumably (channels, height, width); TODO confirm
+    - 3
+    - 224
+    - 224
+    per_device_batch_size: 16
+    load_all_data_for_training: true
+    obs:  # single entry: image_0
+    - image_0
+trainer:  # optimisation schedule, learning rates, loss weights, and resume flags
+  epochs: 100
+  max_train_steps: 30000
+  num_warmup_steps: 5000  # NOTE(review): warmup_ratio below is 0.1 (3000 of 30000 steps); the two disagree — confirm which is used
+  save_interval: 60000  # NOTE(review): larger than max_train_steps (30000); confirm a checkpoint is still written
+  eval_interval: 30000  # equal to max_train_steps
+  learning_rate:
+    base: 4.0e-05
+    qwen_vl_interface: 1.0e-05
+    action_model: 0.0001  # ten times the qwen_vl_interface rate
+  lr_scheduler_type: cosine_with_min_lr
+  scheduler_specific_kwargs:
+    min_lr: 1.0e-06
+  freeze_modules: null
+  loss_scale:
+    vla: 1.0
+    vlm: 0.1
+  max_grad_norm: 1.0  # NOTE(review): gradient_clipping below holds the same value; presumably one is redundant — verify
+  warmup_ratio: 0.1  # NOTE(review): see num_warmup_steps above (5000 vs 0.1 * 30000 = 3000)
+  weight_decay: 0.0  # NOTE(review): optimizer.weight_decay below is 1.0e-08 — confirm which one is applied
+  logging_frequency: 100
+  gradient_clipping: 1.0
+  gradient_accumulation_steps: 1
+  optimizer:
+    name: AdamW
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    weight_decay: 1.0e-08  # differs from trainer.weight_decay (0.0)
+  is_resume: false
+  resume_epoch: null
+  resume_step: null
+  enable_gradient_checkpointing: true
+  enable_mixed_precision_training: true
+output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_goal_2B/starvla_qwen_dual  # run_root_dir + "/" + run_id

libero_object_2B/starvla_qwen_dual/config.yaml ADDED Viewed

	@@ -0,0 +1,111 @@

+run_id: starvla_qwen_dual  # run name; also the last path component of output_dir at the end of this config
+run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_object_2B  # output_dir is this path + "/" + run_id
+seed: 42
+trackers:  # two trackers are listed: jsonl and wandb
+- jsonl
+- wandb
+wandb_project: starVLA
+is_debug: false
+enable_mee: false
+mee_weight: 0.01  # NOTE(review): presumably has no effect while enable_mee is false — confirm in the trainer
+framework:  # model settings: VLM (qwenvl), DINO backbone, and action model
+  name: Qwen-Dual
+  qwenvl:
+    base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct  # local path; directory name says Qwen3-VL 2B Instruct
+    attn_implementation: flash_attention_2
+    vl_hidden_dim: 2048  # same value as action_model.diffusion_model_cfg.cross_attention_dim
+  dino:
+    dino_backbone: dinov2_vits14
+  action_model:
+    action_model_type: DiT-B
+    action_hidden_dim: 2  # NOTE(review): very small next to hidden_size 1024 — confirm this value is intended
+    hidden_size: 1024  # same value as diffusion_model_cfg.output_dim
+    add_pos_embed: true
+    max_seq_len: 1024
+    action_dim: 7  # same value as state_dim
+    state_dim: 7
+    future_action_window_size: 7
+    action_horizon: 8  # equals future_action_window_size + 1; presumably current step plus the future window — TODO confirm
+    past_action_window_size: 0
+    repeated_diffusion_steps: 8
+    noise_beta_alpha: 1.5  # presumably paired with noise_beta_beta as Beta-distribution parameters — verify
+    noise_beta_beta: 1.0
+    noise_s: 0.999
+    num_timestep_buckets: 1000
+    num_inference_timesteps: 4
+    num_target_vision_tokens: 32
+    diffusion_model_cfg:
+      cross_attention_dim: 2048  # same value as qwenvl.vl_hidden_dim
+      dropout: 0.2
+      final_dropout: true
+      interleave_self_attention: true
+      norm_type: ada_norm
+      num_layers: 16
+      output_dim: 1024  # same value as action_model.hidden_size
+      positional_embeddings: null
+  reduce_in_full_precision: true
+datasets:  # two dataset groups: vlm_data and vla_data
+  vlm_data:
+    dataset_py: vlm_datasets
+    dataformat: llava_json
+    dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
+    eval_dataset: aokvqa_cauldron_llava_format
+    data_flatten: false
+    base_interval: 2
+    max_pixels: 12845056  # = 16384 * 28 * 28
+    min_pixels: 3136  # = 4 * 28 * 28
+    model_max_length: 2048
+    model_type: qwen2.5vl  # NOTE(review): framework.qwenvl.base_vlm path names Qwen3-VL, not Qwen2.5-VL — confirm
+    per_device_batch_size: 4  # vla_data uses 16
+  vla_data:
+    dataset_py: lerobot_datasets
+    data_root_dir: /share/project/baishuanghao/data/libero_lerobot
+    data_mix: libero_object  # same suite name as the libero_object_2B component of run_root_dir
+    action_type: delta_qpos
+    CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
+      Locate their bounding boxes in [x1,y1,x2,y2] format.
+    CoT_answer: bbox  # CoT_prompt above is a two-line plain scalar; YAML folds its line break into one space
+    default_image_resolution:  # three entries: 3, 224, 224 — presumably (channels, height, width); TODO confirm
+    - 3
+    - 224
+    - 224
+    per_device_batch_size: 16
+    load_all_data_for_training: true
+    obs:  # single entry: image_0
+    - image_0
+trainer:  # optimisation schedule, learning rates, loss weights, and resume flags
+  epochs: 100
+  max_train_steps: 30000
+  num_warmup_steps: 5000  # NOTE(review): warmup_ratio below is 0.1 (3000 of 30000 steps); the two disagree — confirm which is used
+  save_interval: 60000  # NOTE(review): larger than max_train_steps (30000); confirm a checkpoint is still written
+  eval_interval: 30000  # equal to max_train_steps
+  learning_rate:
+    base: 4.0e-05
+    qwen_vl_interface: 1.0e-05
+    action_model: 0.0001  # ten times the qwen_vl_interface rate
+  lr_scheduler_type: cosine_with_min_lr
+  scheduler_specific_kwargs:
+    min_lr: 1.0e-06
+  freeze_modules: null
+  loss_scale:
+    vla: 1.0
+    vlm: 0.1
+  max_grad_norm: 1.0  # NOTE(review): gradient_clipping below holds the same value; presumably one is redundant — verify
+  warmup_ratio: 0.1  # NOTE(review): see num_warmup_steps above (5000 vs 0.1 * 30000 = 3000)
+  weight_decay: 0.0  # NOTE(review): optimizer.weight_decay below is 1.0e-08 — confirm which one is applied
+  logging_frequency: 100
+  gradient_clipping: 1.0
+  gradient_accumulation_steps: 1
+  optimizer:
+    name: AdamW
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    weight_decay: 1.0e-08  # differs from trainer.weight_decay (0.0)
+  is_resume: false
+  resume_epoch: null
+  resume_step: null
+  enable_gradient_checkpointing: true
+  enable_mixed_precision_training: true
+output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_object_2B/starvla_qwen_dual  # run_root_dir + "/" + run_id

libero_spatial_2B/starvla_qwen_dual/config.yaml ADDED Viewed

	@@ -0,0 +1,111 @@

+run_id: starvla_qwen_dual  # run name; also the last path component of output_dir at the end of this config
+run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_spatial_2B  # output_dir is this path + "/" + run_id
+seed: 42
+trackers:  # two trackers are listed: jsonl and wandb
+- jsonl
+- wandb
+wandb_project: starVLA
+is_debug: false
+enable_mee: false
+mee_weight: 0.01  # NOTE(review): presumably has no effect while enable_mee is false — confirm in the trainer
+framework:  # model settings: VLM (qwenvl), DINO backbone, and action model
+  name: Qwen-Dual
+  qwenvl:
+    base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct  # local path; directory name says Qwen3-VL 2B Instruct
+    attn_implementation: flash_attention_2
+    vl_hidden_dim: 2048  # same value as action_model.diffusion_model_cfg.cross_attention_dim
+  dino:
+    dino_backbone: dinov2_vits14
+  action_model:
+    action_model_type: DiT-B
+    action_hidden_dim: 2  # NOTE(review): very small next to hidden_size 1024 — confirm this value is intended
+    hidden_size: 1024  # same value as diffusion_model_cfg.output_dim
+    add_pos_embed: true
+    max_seq_len: 1024
+    action_dim: 7  # same value as state_dim
+    state_dim: 7
+    future_action_window_size: 7
+    action_horizon: 8  # equals future_action_window_size + 1; presumably current step plus the future window — TODO confirm
+    past_action_window_size: 0
+    repeated_diffusion_steps: 8
+    noise_beta_alpha: 1.5  # presumably paired with noise_beta_beta as Beta-distribution parameters — verify
+    noise_beta_beta: 1.0
+    noise_s: 0.999
+    num_timestep_buckets: 1000
+    num_inference_timesteps: 4
+    num_target_vision_tokens: 32
+    diffusion_model_cfg:
+      cross_attention_dim: 2048  # same value as qwenvl.vl_hidden_dim
+      dropout: 0.2
+      final_dropout: true
+      interleave_self_attention: true
+      norm_type: ada_norm
+      num_layers: 16
+      output_dim: 1024  # same value as action_model.hidden_size
+      positional_embeddings: null
+  reduce_in_full_precision: true
+datasets:  # two dataset groups: vlm_data and vla_data
+  vlm_data:
+    dataset_py: vlm_datasets
+    dataformat: llava_json
+    dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
+    eval_dataset: aokvqa_cauldron_llava_format
+    data_flatten: false
+    base_interval: 2
+    max_pixels: 12845056  # = 16384 * 28 * 28
+    min_pixels: 3136  # = 4 * 28 * 28
+    model_max_length: 2048
+    model_type: qwen2.5vl  # NOTE(review): framework.qwenvl.base_vlm path names Qwen3-VL, not Qwen2.5-VL — confirm
+    per_device_batch_size: 4  # vla_data uses 16
+  vla_data:
+    dataset_py: lerobot_datasets
+    data_root_dir: /share/project/baishuanghao/data/libero_lerobot
+    data_mix: libero_spatial  # same suite name as the libero_spatial_2B component of run_root_dir
+    action_type: delta_qpos
+    CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
+      Locate their bounding boxes in [x1,y1,x2,y2] format.
+    CoT_answer: bbox  # CoT_prompt above is a two-line plain scalar; YAML folds its line break into one space
+    default_image_resolution:  # three entries: 3, 224, 224 — presumably (channels, height, width); TODO confirm
+    - 3
+    - 224
+    - 224
+    per_device_batch_size: 16
+    load_all_data_for_training: true
+    obs:  # single entry: image_0
+    - image_0
+trainer:  # optimisation schedule, learning rates, loss weights, and resume flags
+  epochs: 100
+  max_train_steps: 30000
+  num_warmup_steps: 5000  # NOTE(review): warmup_ratio below is 0.1 (3000 of 30000 steps); the two disagree — confirm which is used
+  save_interval: 60000  # NOTE(review): larger than max_train_steps (30000); confirm a checkpoint is still written
+  eval_interval: 30000  # equal to max_train_steps
+  learning_rate:
+    base: 4.0e-05
+    qwen_vl_interface: 1.0e-05
+    action_model: 0.0001  # ten times the qwen_vl_interface rate
+  lr_scheduler_type: cosine_with_min_lr
+  scheduler_specific_kwargs:
+    min_lr: 1.0e-06
+  freeze_modules: null
+  loss_scale:
+    vla: 1.0
+    vlm: 0.1
+  max_grad_norm: 1.0  # NOTE(review): gradient_clipping below holds the same value; presumably one is redundant — verify
+  warmup_ratio: 0.1  # NOTE(review): see num_warmup_steps above (5000 vs 0.1 * 30000 = 3000)
+  weight_decay: 0.0  # NOTE(review): optimizer.weight_decay below is 1.0e-08 — confirm which one is applied
+  logging_frequency: 100
+  gradient_clipping: 1.0
+  gradient_accumulation_steps: 1
+  optimizer:
+    name: AdamW
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    weight_decay: 1.0e-08  # differs from trainer.weight_decay (0.0)
+  is_resume: false
+  resume_epoch: null
+  resume_step: null
+  enable_gradient_checkpointing: true
+  enable_mixed_precision_training: true
+output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_spatial_2B/starvla_qwen_dual  # run_root_dir + "/" + run_id