Robotics
LeRobot
Safetensors
act
diffusion
imitation-learning
behavior-cloning
aloha
pytorch_model_hub_mixin
model_hub_mixin
Instructions to use JHeisler/aloha_solo_left_act_diffusion with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- LeRobot
How to use JHeisler/aloha_solo_left_act_diffusion with LeRobot:
- Notebooks
- Google Colab
- Kaggle
| # @package _global_ | |
| # REFERENCE COPY — canonical is in this workstream's local lerobot clone at: | |
| # ./lerobot/lerobot/configs/policy/act_diffusion_aloha_solo_real.yaml | |
| # Single-arm (LEFT) ALOHA — Hybrid ACT+Diffusion policy. | |
| # ACT encoder (ResNet18 + transformer) → diffusion U-Net (noise_scheduler_type: DDPM, 10 inference steps) → action chunks. | |
| # 2 cameras: cam_left_wrist + cam_high. state_dim=action_dim=9. | |
| # DOE winner: batch=24, lr=3e-5 (2026-04-20) | |
| seed: 1000 | |
| dataset_repo_id: JHeisler/aloha_solo_left_4_6_26 | |
| override_dataset_stats: | |
| observation.images.cam_left_wrist: | |
| mean: [[[0.485]], [[0.456]], [[0.406]]] | |
| std: [[[0.229]], [[0.224]], [[0.225]]] | |
| observation.images.cam_high: | |
| mean: [[[0.485]], [[0.456]], [[0.406]]] | |
| std: [[[0.229]], [[0.224]], [[0.225]]] | |
| use_amp: true | |
| use_torch_compile: true | |
| training: | |
| offline_steps: 13400 | |
| online_steps: 0 | |
| eval_freq: -1 | |
| save_freq: 5000 | |
| log_freq: 100 | |
| save_checkpoint: true | |
| batch_size: 24 | |
| lr: 3e-5 | |
| lr_backbone: 3e-5 | |
| lr_warmup_steps: 500 | |
| drop_n_last_frames: 2 | |
| weight_decay: 1e-4 | |
| grad_clip_norm: 10 | |
| online_steps_between_rollouts: 1 | |
| delta_timestamps: | |
| action: "[i / ${fps} for i in range(${policy.chunk_size})]" | |
| eval: | |
| n_episodes: 50 | |
| batch_size: 50 | |
| policy: | |
| name: hybrid_act_diffusion | |
| n_obs_steps: 1 | |
| chunk_size: 100 | |
| n_action_steps: 100 | |
| input_shapes: | |
| observation.images.cam_left_wrist: [3, 480, 640] | |
| observation.images.cam_high: [3, 480, 640] | |
| observation.state: ["${env.state_dim}"] | |
| output_shapes: | |
| action: ["${env.action_dim}"] | |
| input_normalization_modes: | |
| observation.images.cam_left_wrist: mean_std | |
| observation.images.cam_high: mean_std | |
| observation.state: mean_std | |
| output_normalization_modes: | |
| action: mean_std | |
| # ACT encoder (vision backbone + transformer) | |
| vision_backbone: resnet18 | |
| pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1 | |
| replace_final_stride_with_dilation: false | |
| pre_norm: false | |
| dim_model: 512 | |
| n_heads: 8 | |
| dim_feedforward: 3200 | |
| feedforward_activation: relu | |
| n_encoder_layers: 4 | |
| dropout: 0.1 | |
| # Diffusion U-Net | |
| down_dims: [256, 512] | |
| kernel_size: 5 | |
| n_groups: 8 | |
| diffusion_step_embed_dim: 128 | |
| use_film_scale_modulation: true | |
| # Noise scheduler | |
| noise_scheduler_type: DDPM | |
| num_train_timesteps: 100 | |
| beta_schedule: squaredcos_cap_v2 | |
| beta_start: 0.0001 | |
| beta_end: 0.02 | |
| prediction_type: epsilon | |
| clip_sample: true | |
| clip_sample_range: 1.0 | |
| num_inference_steps: 10 | |
| do_mask_loss_for_padding: true | |