| data: |
| name: libero_cosmos_policy |
| type: mg |
| backend: robomimic |
| paths: [] |
| task_suite_name: null |
| observations_keys: |
| - image |
| observation_source_keys: |
| image: agentview_rgb_jpeg |
| action_dim: 7 |
| frame_stack: 1 |
| horizon: 1 |
| temporal_index_mode: clip |
| use_proprio: false |
| proprio_source: robot_states |
| image_chw: |
| - 3 |
| - 224 |
| - 224 |
| image_value_range: zero_to_one |
| image_transport_dtype: uint8 |
| duration_focus: null |
| hf: |
| repo_id: nvidia/LIBERO-Cosmos-Policy |
| repo_type: dataset |
| allow_patterns: |
| - success_only/*_regen/*.hdf5 |
| local_files_only: false |
| action_sequence_targets: |
| enabled: true |
| horizons: |
| - 20 |
| target_key: gt_action_seq_max |
| train: |
| optimizer: |
| type: adamw |
| lr: 0.0001 |
| weight_decay: 1.0e-06 |
| batch_size: 256 |
| num_workers: 4 |
| prefetch_factor: null |
| log_interval: 100 |
| total_steps: 5000 |
| eval_interval: 500 |
| use_aug: true |
| aug_mode: default |
| use_amp: true |
| device_transfer_non_blocking: true |
| seed: 42 |
| scheduler: |
| type: cosine |
| warmup_steps: 500 |
| num_cycles: 0.5 |
| min_lr_scale: 0.0 |
| decoder_dataloader: |
| enabled: false |
| batch_size: 64 |
| num_workers: 4 |
| prefetch_factor: null |
| shuffle: true |
| stages: |
| stage1: true |
| stage2: false |
| stage3: false |
| stage1: |
| batch_size: 256 |
| num_workers: 16 |
| prefetch_factor: 2 |
| log_interval: 100 |
| total_steps: 20000 |
| eval_interval: 500 |
| optimizer: |
| lr: 0.0001 |
| weight_decay: 1.0e-06 |
| stage2: |
| batch_size: 256 |
| num_workers: 4 |
| prefetch_factor: null |
| log_interval: 100 |
| total_steps: 5000 |
| eval_interval: 500 |
| latent_target: auto |
| optimizer: |
| lr: 0.0001 |
| weight_decay: 1.0e-06 |
| action_probe: |
| enabled: true |
| stage3: |
| batch_size: 256 |
| num_workers: 4 |
| prefetch_factor: null |
| log_interval: 100 |
| total_steps: 5000 |
| eval_interval: 500 |
| final_eval_num_episodes: 100 |
| log_eval_video: false |
| optimizer: |
| lr: 0.0001 |
| weight_decay: 1.0e-06 |
| trainable_scope: all |
| non_decoder_lr_scale: 1.0 |
| checkpoint: |
| enabled: true |
| base_dir: null |
| save_interval: 1000 |
| save_last: true |
| latest_only: true |
| load: |
| stage1: |
| mode: none |
| path: null |
| stage2: |
| mode: handoff |
| path: null |
| teacher_path: null |
| stage3: |
| mode: handoff |
| path: null |
| model: |
| idm: |
| type: token_idm |
| action_dim: 128 |
| token_dim: 768 |
| model_dim: 512 |
| latent_dim: 32 |
| num_action_tokens: 4 |
| num_blocks: 4 |
| num_heads: 8 |
| dropout: 0.0 |
| fdm: |
| type: token_fdm_duration |
| action_dim: 128 |
| token_dim: 768 |
| model_dim: 512 |
| num_action_tokens: 4 |
| num_blocks: 4 |
| num_heads: 8 |
| dropout: 0.0 |
| hyperbolic_latent: |
| enabled: false |
| backend: geoopt |
| manifold: poincare |
| curvature: 1.0 |
| learnable_curvature: false |
| fdm_input_mode: logmap0 |
| prelift_mode: none |
| prelift_scale: 1.0 |
| lift_max_norm: 5.0 |
| tangent_max_norm: 5.0 |
| eps: 1.0e-06 |
| encoders: |
| type: dino |
| image_key: image |
| model_id: facebook/dinov2-base |
| input_value_range: zero_to_one |
| freeze_backbone: true |
| drop_cls_token: true |
| output_dim: 768 |
| mean: |
| - 0.485 |
| - 0.456 |
| - 0.406 |
| std: |
| - 0.229 |
| - 0.224 |
| - 0.225 |
| type: dino_lam |
| idm_input: future |
| fdm_target: future |
| pixel_decoders: null |
| latent_action_decoders: null |
| objective: |
| fdm_target: future |
| idm_input: future |
| stage_overrides: {} |
| multiscale: |
| enabled: true |
| consistency: |
| enabled: false |
| weight: 10.0 |
| num_pairs: 4 |
| sample_source: all_horizons |
| teacher_mode: direct_teacher |
| prediction_mode: direct_duration |
| allow_plain_fdm_oneshot: true |
| weight_mode: uniform |
| weights: {} |
| temporal: |
| max_offset: 20 |
| anchor_mode: fixed |
| extra_random_count: 4 |
| branch_order: |
| enabled: false |
| radial_weight: 0.0 |
| local_radial_margin_weight: 0.0 |
| local_radial_margin_alpha: 0.05 |
| branch_weight: 0.0 |
| z0_origin_weight: 0.0 |
| prefix_weight: 0.0 |
| radius_progress_weight: 1.0 |
| radius_progress_mode: offset_margin |
| radius_progress_alpha: 0.02 |
| branch_margin_deg: 10.0 |
| eps: 1.0e-06 |
| latent_plan: |
| enabled: false |
| total_horizon: 20 |
| eval: |
| type: robomimic |
| data_path: [] |
| reset_mode: env_reset |
| num_eval_episodes: 20 |
| max_steps: 500 |
| record_video: true |
| checkpoint_path: null |
| checkpoint_strict: true |
| use_checkpoint_cfg: true |
| video_output_path: null |
| obs_keys: null |
| use_proprio: null |
| use_object: false |
| resize_hw: null |
| image_value_range: null |
| seed: 0 |
| agent: |
| encoders: |
| image: |
| in_channels: 3 |
| output_dim: 512 |
| output_mode: global |
| type: resnet18 |
| pretrained: false |
| proprio: |
| input_dim: 9 |
| hidden_dim: 128 |
| output_dim: 64 |
| type: group |
| modalities: |
| - agentview_image |
| - robot0_eye_in_hand_image |
| proj_dim: 128 |
| policies: |
| decoder: |
| type: mlp |
| hidden_dims: |
| - 256 |
| - 256 |
| hidden_dims: |
| - 512 |
| - 512 |
| - 256 |
| - 64 |
| action_dim: 256 |
| emb_dim: 384 |
| gt_action_dim: 7 |
| type: latent_action |
| probes: |
| enabled: true |
| every: 10 |
| steps_per_call: 1 |
| sequence: |
| enabled: true |
| horizons: |
| - 20 |
| target_key: gt_action_seq_max |
| list: |
| z_to_s_t: |
| name: z_to_s_t |
| enabled: false |
| shuffle: true |
| type: regression |
| input: z_t |
| target: s_t |
| loss: mse |
| lr: 0.001 |
| every: 10 |
| mlp: |
| hidden_dims: |
| - 128 |
| - 64 |
| z_to_s_tp: |
| name: z_to_s_tp |
| enabled: false |
| shuffle: true |
| type: regression |
| input: z_t |
| target: s_tp |
| loss: mse |
| lr: 0.001 |
| every: 10 |
| mlp: |
| hidden_dims: |
| - 128 |
| - 64 |
| z_to_action_t: |
| name: z_to_action_t |
| probe_type: z_to_action |
| enabled: true |
| shuffle: true |
| type: regression |
| input: z_t |
| target: gt_action |
| loss: mse |
| lr: 0.001 |
| every: 1 |
| mlp: |
| hidden_dims: |
| - 128 |
| - 64 |
| z_to_action_seq_h20: |
| name: z_to_action_seq_h20 |
| probe_type: z_to_action_sequence |
| enabled: true |
| shuffle: true |
| type: regression |
| input: z_t |
| target: gt_action_seq_max |
| sequence_horizon: 20 |
| loss: mse |
| lr: 0.001 |
| every: 10 |
| mlp: |
| hidden_dims: |
| - 128 |
| - 64 |
| output_dim: 140 |
| logger: |
| project: latent_action |
| run_name: latent_action_training |
| tags: |
| - debug |
| output_root: outputs/token_fdm_duration/dino_suite_all_k20_extra4 |
|
|