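# Source: VLA-JEPA / SimplerEnv / config.yaml
# Repository page metadata captured with this file (not part of the config):
#   ginwind's picture; commit message "update"; revision 9cea7bd (verified)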
---
# Run configuration for the VLA_JEPA framework (run_id: SimplerEnv).
#
# NOTE(review): the copy this file was restored from had lost all of its
# indentation. In that flattened form `action_horizon` and `weight_decay`
# each appeared twice at the same level, every section key parsed as null,
# and the second line of `CoT_prompt` was not a valid scalar continuation.
# The nesting below is reconstructed from key order and section names.
# Confirm the placement of `vj2_model`, `num_frames`,
# `reduce_in_full_precision` and `output_dir` against the code that reads
# this file.

# Run identity and bookkeeping.
run_id: SimplerEnv
run_root_dir: checkpoints
seed: 42
trackers:
  - json
is_debug: false

# Model definition.
framework:
  name: VLA_JEPA

  qwenvl:
    # Absolute, machine-specific path; adjust per host.
    base_vlm: /home/dataset-local/models/Qwen3-VL-2B-Instruct
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048

  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 7
    state_dim: 8
    # presumably action_horizon = past + 1 + future (0 + 1 + 6 = 7)
    # -- TODO confirm against the action model code.
    future_action_window_size: 6
    # Same value as datasets.vla_data.action_horizon; presumably the two
    # must be changed together -- TODO confirm.
    action_horizon: 7
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    diffusion_model_cfg:
      # Same value as framework.qwenvl.vl_hidden_dim (2048); presumably
      # these have to match -- TODO confirm.
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null

  vj2_model:
    # Absolute, machine-specific path; adjust per host.
    base_encoder: /home/dataset-local/models/vjepa2-vitl-fpc64-256
    depth: 12
    num_heads: 8
    # Quoted so the braces and pipes are unmistakably literal text.
    # `{}` looks like a format placeholder -- presumably filled with a
    # token index by the loader; verify.
    special_action_token: '<|action_{}|>'
    num_action_tokens_per_timestep: 8
    embodied_action_token: '<|embodied_action|>'
    num_embodied_action_tokens_per_instruction: 32
    num_frames: 8

  reduce_in_full_precision: true

# Training data.
datasets:
  vla_data:
    dataset_py: lerobot_datasets
    # Absolute, machine-specific path; adjust per host.
    data_root_dir: /home/dataset-local/datasets/LeRobot/OXE_LEROBOT_DATASET
    data_mix: bridge_rt_1
    action_type: delta_ee
    # Folded scalar: the two lines are joined with a single space and no
    # trailing newline, i.e. one prompt string. {instruction}, {actions}
    # and {e_actions} are placeholders, presumably substituted by the data
    # pipeline -- verify.
    CoT_prompt: >-
      Your task is {instruction}. Infer the temporal dynamics from frames
      {actions} and produce the corresponding policy actions {e_actions}.
    resolution_size: 224
    video_resolution_size: 256
    per_device_batch_size: 32
    load_all_data_for_training: true
    action_horizon: 7
    # NOTE(review): state input is disabled here while
    # framework.action_model.state_dim is 8 -- confirm that is intended.
    with_state: false

# Optimisation schedule.
trainer:
  # NOTE(review): both an epoch count and a step cap are set; which one
  # ends training is not visible in this file.
  epochs: 100
  max_train_steps: 30000
  # NOTE(review): warmup_ratio below gives 0.1 * 30000 = 3000 steps, which
  # differs from this value -- confirm which setting the scheduler reads.
  num_warmup_steps: 5000
  save_interval: 10000
  eval_interval: 100
  learning_rate:
    base: 3.0e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
    vj_predictor: 0.0005
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    # NOTE(review): equal to learning_rate.qwen_vl_interface -- confirm how
    # the floor interacts with per-module learning rates.
    min_lr: 1.0e-05
  freeze_modules: ''
  loss_scale:
    vla: 1.0
    vlm: 0.1
  # NOTE(review): max_grad_norm and gradient_clipping (below) carry the
  # same value; confirm which one the trainer consumes.
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  # NOTE(review): differs from optimizer.weight_decay (1.0e-08); confirm
  # which one takes effect.
  weight_decay: 0.0
  logging_frequency: 10
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  is_resume: false
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true

# Matches <run_root_dir>/<run_id> as set at the top of this file.
output_dir: checkpoints/SimplerEnv