---
# Training configuration for the LIBERO latent-action pipeline
# (token IDM/FDM over DINOv2 features, robomimic backend).
#
# NOTE(review): this file was recovered from a whitespace-mangled paste; the
# nesting at each dedent point was reconstructed from key semantics so that no
# mapping contains a duplicate key. Ambiguous attachment points are marked
# with NOTE(review) below — verify against the consumer's config schema.

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
data:
  name: libero_cosmos_policy
  type: mg
  backend: robomimic
  paths: []
  task_suite_name: null
  observations_keys:
    - image
  observation_source_keys:
    image: agentview_rgb_jpeg
  action_dim: 7
  frame_stack: 1
  horizon: 1
  temporal_index_mode: clip
  use_proprio: false
  proprio_source: robot_states
  # Image tensor layout: channels, height, width.
  image_chw:
    - 3
    - 224
    - 224
  image_value_range: zero_to_one
  image_transport_dtype: uint8
  duration_focus: null
  hf:
    repo_id: nvidia/LIBERO-Cosmos-Policy
    repo_type: dataset
    allow_patterns:
      # Quoted: glob pattern; keeps the `*` unmistakably a literal character.
      - "success_only/*_regen/*.hdf5"
    local_files_only: false
  action_sequence_targets:
    enabled: true
    horizons:
      - 20
    target_key: gt_action_seq_max

# ---------------------------------------------------------------------------
# Training (top-level defaults plus per-stage overrides)
# ---------------------------------------------------------------------------
train:
  optimizer:
    type: adamw
    lr: 0.0001
    weight_decay: 1.0e-06
  batch_size: 256
  num_workers: 4
  prefetch_factor: null
  log_interval: 100
  total_steps: 5000
  eval_interval: 500
  use_aug: true
  aug_mode: default
  use_amp: true
  device_transfer_non_blocking: true
  seed: 42
  scheduler:
    type: cosine
    warmup_steps: 500
    num_cycles: 0.5
    min_lr_scale: 0.0
  decoder_dataloader:
    enabled: false
    batch_size: 64
    num_workers: 4
    prefetch_factor: null
    shuffle: true
  # Which stages this run executes.
  stages:
    stage1: true
    stage2: false
    stage3: false
  stage1:
    batch_size: 256
    num_workers: 16
    prefetch_factor: 2
    log_interval: 100
    total_steps: 20000
    eval_interval: 500
    optimizer:
      lr: 0.0001
      weight_decay: 1.0e-06
  stage2:
    batch_size: 256
    num_workers: 4
    prefetch_factor: null
    log_interval: 100
    total_steps: 5000
    eval_interval: 500
    latent_target: auto
    optimizer:
      lr: 0.0001
      weight_decay: 1.0e-06
    action_probe:
      enabled: true
  stage3:
    batch_size: 256
    num_workers: 4
    prefetch_factor: null
    log_interval: 100
    total_steps: 5000
    eval_interval: 500
    final_eval_num_episodes: 100
    log_eval_video: false
    optimizer:
      lr: 0.0001
      weight_decay: 1.0e-06
    trainable_scope: all
    non_decoder_lr_scale: 1.0
  checkpoint:
    enabled: true
    base_dir: null
    save_interval: 1000
    save_last: true
    latest_only: true
    # Per-stage checkpoint loading policy.
    load:
      stage1:
        mode: none
        path: null
      stage2:
        mode: handoff
        path: null
        teacher_path: null
      stage3:
        mode: handoff
        path: null

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
model:
  idm:
    type: token_idm
    action_dim: 128
    token_dim: 768
    model_dim: 512
    latent_dim: 32
    num_action_tokens: 4
    num_blocks: 4
    num_heads: 8
    dropout: 0.0
  fdm:
    type: token_fdm_duration
    action_dim: 128
    token_dim: 768
    model_dim: 512
    num_action_tokens: 4
    num_blocks: 4
    num_heads: 8
    dropout: 0.0
  hyperbolic_latent:
    enabled: false
    backend: geoopt
    manifold: poincare
    curvature: 1.0
    learnable_curvature: false
    fdm_input_mode: logmap0
    prelift_mode: none
    prelift_scale: 1.0
    lift_max_norm: 5.0
    tangent_max_norm: 5.0
    eps: 1.0e-06
  encoders:
    type: dino
    image_key: image
    model_id: facebook/dinov2-base
    input_value_range: zero_to_one
    freeze_backbone: true
    drop_cls_token: true
    output_dim: 768
    # ImageNet normalization statistics (RGB order).
    mean:
      - 0.485
      - 0.456
      - 0.406
    std:
      - 0.229
      - 0.224
      - 0.225
  # NOTE(review): `type: dino_lam` attached at model level — attaching it
  # inside `encoders` would duplicate the `type: dino` key there. Confirm.
  type: dino_lam
  idm_input: future
  fdm_target: future
  pixel_decoders: null
  latent_action_decoders: null

# ---------------------------------------------------------------------------
# Objective
# ---------------------------------------------------------------------------
objective:
  fdm_target: future
  idm_input: future
  stage_overrides: {}
  multiscale:
    enabled: true
    consistency:
      enabled: false
      weight: 10.0
      num_pairs: 4
      sample_source: all_horizons
    teacher_mode: direct_teacher
    prediction_mode: direct_duration
    allow_plain_fdm_oneshot: true
    weight_mode: uniform
    weights: {}
    temporal:
      max_offset: 20
      anchor_mode: fixed
      extra_random_count: 4
    branch_order:
      enabled: false
      radial_weight: 0.0
      local_radial_margin_weight: 0.0
      local_radial_margin_alpha: 0.05
      branch_weight: 0.0
      z0_origin_weight: 0.0
      prefix_weight: 0.0
      radius_progress_weight: 1.0
      radius_progress_mode: offset_margin
      radius_progress_alpha: 0.02
      branch_margin_deg: 10.0
      eps: 1.0e-06
  latent_plan:
    enabled: false
    total_horizon: 20

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
eval:
  type: robomimic
  data_path: []
  reset_mode: env_reset
  num_eval_episodes: 20
  max_steps: 500
  record_video: true
  checkpoint_path: null
  checkpoint_strict: true
  use_checkpoint_cfg: true
  video_output_path: null
  # null entries fall back to the values baked into the checkpoint config.
  obs_keys: null
  use_proprio: null
  use_object: false
  resize_hw: null
  image_value_range: null
  seed: 0

# ---------------------------------------------------------------------------
# Agent (evaluation-time policy network)
# ---------------------------------------------------------------------------
# NOTE(review): read as a top-level section (sibling of `eval`), not nested
# under it — confirm against the consuming code.
agent:
  encoders:
    image:
      in_channels: 3
      output_dim: 512
      output_mode: global
      type: resnet18
      pretrained: false
    proprio:
      input_dim: 9
      hidden_dim: 128
      output_dim: 64
  type: group
  modalities:
    - agentview_image
    - robot0_eye_in_hand_image
  proj_dim: 128
  policies:
    # NOTE(review): `decoder` read as a nested sub-config (type: mlp) and the
    # second `hidden_dims`/`type: latent_action` as policy-level keys; keeping
    # both inside `decoder` would create duplicate `hidden_dims`/`type` keys.
    decoder:
      type: mlp
      hidden_dims:
        - 256
        - 256
    hidden_dims:
      - 512
      - 512
      - 256
      - 64
    action_dim: 256
    emb_dim: 384
    gt_action_dim: 7
    type: latent_action

# ---------------------------------------------------------------------------
# Probes (auxiliary regression heads for latent diagnostics)
# ---------------------------------------------------------------------------
probes:
  enabled: true
  every: 10
  steps_per_call: 1
  sequence:
    enabled: true
    horizons:
      - 20
    target_key: gt_action_seq_max
  list:
    z_to_s_t:
      name: z_to_s_t
      enabled: false
      shuffle: true
      type: regression
      input: z_t
      target: s_t
      loss: mse
      lr: 0.001
      every: 10
      mlp:
        hidden_dims:
          - 128
          - 64
    z_to_s_tp:
      name: z_to_s_tp
      enabled: false
      shuffle: true
      type: regression
      input: z_t
      target: s_tp
      loss: mse
      lr: 0.001
      every: 10
      mlp:
        hidden_dims:
          - 128
          - 64
    z_to_action_t:
      name: z_to_action_t
      probe_type: z_to_action
      enabled: true
      shuffle: true
      type: regression
      input: z_t
      target: gt_action
      loss: mse
      lr: 0.001
      every: 1
      mlp:
        hidden_dims:
          - 128
          - 64
    z_to_action_seq_h20:
      name: z_to_action_seq_h20
      probe_type: z_to_action_sequence
      enabled: true
      shuffle: true
      type: regression
      input: z_t
      target: gt_action_seq_max
      sequence_horizon: 20
      loss: mse
      lr: 0.001
      every: 10
      mlp:
        hidden_dims:
          - 128
          - 64
        # NOTE(review): 140 = sequence_horizon (20) * action dim (7); placed
        # under `mlp` — confirm it is not a probe-level key.
        output_dim: 140

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logger:
  project: latent_action
  run_name: latent_action_training
  tags:
    - debug

output_root: outputs/token_fdm_duration/dino_suite_all_k20_extra4