run_id: 0428_liberoall
run_root_dir: ./checkpoints
seed: 42
trackers:
- jsonl
- wandb
wandb_entity: junjin
wandb_project: 0428_liberoall
is_debug: false
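# framework: model assembly for this run, as configured below: a Qwen3-VL-4B-Instruct VLM backbone,
# a DiT-B diffusion action head, a VGGT spatial encoder, and a LoRA-adapted LongCat-Image-Edit branch
# (component names read from the checkpoint paths and model-type keys that follow).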
framework:
  name: QwenAML
  qwenvl:
    base_vlm: ./checkpoints/Qwen3-VL-4B-Instruct-Action
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048
  dino:
    dino_backbone: dinov2_vits14
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 14
    state_dim: 14
    future_action_window_size: 9
    action_horizon: 10
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    use_state: false
    diffusion_model_cfg:
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
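# Note: action_horizon (10) = past_action_window_size (0) + 1 current step + future_action_window_size (9),
# with 14-dimensional action and state vectors; this chunking reading is inferred from the values above,
# not stated elsewhere in the file.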
  spatial_model:
    model_name_or_path: ./checkpoints/vggt
    output_dim: 2048
  spatial_projector:
    hidden_dim: 2048
    output_dim: 2560
  fuser:
    type: cross_attention
  reduce_in_full_precision: true
  use_mv_images: false
  layer_qformer:
    num_layers: 4
    num_query_tokens: 128
    input_dim: 2560
    ouptput_dim: 2560
  image_edit_model:
    model_name_or_path: ./checkpoints/LongCat-Image-Edit
    lora_path: ./checkpoints/Multi-view-VLA/LongCat-lora
    view_num: 2
    fuser_type: mlp_gated_tranformer
    read_from_local: true
    num_inference_steps: 8
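# Note: base_vlm, spatial_model.model_name_or_path, image_edit_model.model_name_or_path, and lora_path
# all point to local directories under ./checkpoints (and read_from_local is true), so those checkpoints
# are expected to be downloaded and placed there before this config can be used; this is inferred from
# the relative paths above.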
datasets:
  vlm_data:
    dataset_py: vlm_datasets
    dataformat: llava_json
    dataset_use: sharegpt4v_coco
    eval_dataset: sharegpt4v_coco
    data_flatten: false
    base_interval: 2
    max_pixels: 307200
    min_pixels: 784
    model_max_length: 2048
    model_type: qwen2.5vl
    per_device_batch_size: 4
  vla_data:
    num_workers: 4
    dataset_py: lerobot_datasets
    data_root_dir: /mnt/xlab-nas-2/vla_dataset/benchmark/libero
    mv_data_root_dir: ./dataset/libero_mv_feats
    data_mix: libero_all_ration
    action_type: delta_qpos
    CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
      Locate their bounding boxes in [x1,y1,x2,y2] format.
    CoT_answer: bbox
    default_image_resolution:
    - 3
    - 224
    - 224
    per_device_batch_size: 16
    load_all_data_for_training: true
    obs:
    - image_0
    video_backend: torchvision_av
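# Note: the {instruction} placeholder in CoT_prompt is presumably filled with each episode's language
# instruction at data-loading time, and CoT_answer: bbox asks the model to answer with object bounding
# boxes; this reading is inferred from the prompt text itself.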
trainer:
  epochs: 100
  max_train_steps: 40000
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 1000000
  learning_rate:
    base: 2.5e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 1.0e-06
  freeze_modules: spatial_model,image_edit_model
  loss_scale:
    vla: 1.0
    vlm: 0.1
    forcing: 0.2
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  is_resume: false
  resume_from_checkpoint: null
  pretrained_checkpoint: ./checkpoints/Multi-view-VLA/pretrained_model/checkpoints/steps_14000_pytorch_model.pt
  reload_modules: qwen_vl_interface,action_model
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
  vla_data:
    video_backend: torchvision_av
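# Note: with per_device_batch_size 16 for vla_data, gradient_accumulation_steps 1, and the 4 GPUs implied
# by the "4gpus" tag in output_dir below, the effective VLA batch size would be 16 x 1 x 4 = 64 samples per
# optimizer step; the GPU count is inferred from the run name, not set anywhere in this file.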
output_dir: ./checkpoints/0428_liberoall_Qwen3vlGR00TAML_vggt_longcat_view2_mlp_gated_tranformer_bs16_4gpus_reload_vlm_action_ration
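A minimal sketch of consuming this config, assuming the file above is saved locally as config.yaml and that PyYAML and PyTorch are installed; the per-module optimizer grouping mirrors the learning_rate, freeze_modules, and optimizer keys, but the submodule names on the model object are illustrative assumptions, not taken from the repo's code.

```python
import yaml
import torch

# Load the training config listed above (the local file name is an assumption).
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

trainer_cfg = cfg["trainer"]
lrs = trainer_cfg["learning_rate"]                      # {'base': 2.5e-05, 'qwen_vl_interface': 1e-05, 'action_model': 0.0001}
frozen = set(trainer_cfg["freeze_modules"].split(","))  # {'spatial_model', 'image_edit_model'}


def build_optimizer(model: torch.nn.Module) -> torch.optim.AdamW:
    """Per-module AdamW groups; assumes `model` exposes child modules named like the config keys."""
    groups = []
    for name, module in model.named_children():
        if name in frozen:
            # freeze_modules: spatial_model,image_edit_model
            for p in module.parameters():
                p.requires_grad_(False)
            continue
        params = [p for p in module.parameters() if p.requires_grad]
        if params:
            # Modules without a dedicated entry fall back to the base learning rate.
            groups.append({"params": params, "lr": lrs.get(name, lrs["base"])})
    opt = trainer_cfg["optimizer"]                       # AdamW, betas (0.9, 0.95), eps 1e-08
    return torch.optim.AdamW(
        groups,
        betas=tuple(opt["betas"]),
        eps=opt["eps"],
        weight_decay=opt["weight_decay"],
    )
```

The pretrained_checkpoint and reload_modules keys suggest that the qwen_vl_interface and action_model weights are reloaded from the 14,000-step pretrained checkpoint before fine-tuning; that loading step is not sketched here.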