layout:
  in_len: &in_len 4
  out_len: &out_len 1
  in_step: &in_step 1
  out_step: &out_step 1
  in_out_diff: &in_out_diff 18
  img_height: &img_height 128
  img_width: &img_width 128
  data_channels: 1
  layout: "NTHWC"  # N=batch, T=time, H=height, W=width, C=channel
dataset:
  dataset_name: "sevirlr"
  img_height: *img_height
  img_width: *img_width
  in_len: *in_len
  out_len: *out_len
  in_step: *in_step
  out_step: *out_step
  in_out_diff: *in_out_diff
  seq_len: &seq_len 22
  plot_stride: 1
  interval_real_time: 10
  sample_mode: "sequent"
  stride: 3
  layout: "NTHWC"
  start_date: null
  train_test_split_date: [2019, 6, 1]
  end_date: null
  val_ratio: 0.1
  metrics_mode: "0"
  metrics_list: ['csi', 'pod', 'sucr', 'bias']
  threshold_list: [16, 74, 133, 160, 181, 219]  # VIL thresholds for the categorical metrics above
  aug_mode: "2"
optim:
  total_batch_size: 128
  micro_batch_size: 128
  seed: 0
  float32_matmul_precision: "high"
  method: "adamw"
  lr: 1.0e-3
  wd: 1.0e-2
  betas: [0.9, 0.999]
  gradient_clip_val: 1.0
  max_epochs: 1000
  loss_type: "l2"
  # scheduler
  warmup_percentage: 0.1
  lr_scheduler_mode: "cosine"
  min_lr_ratio: 1.0e-3
  warmup_min_lr_ratio: 0.1
  plateau_patience: 10
  # early stopping
  monitor: "val_loss_epoch"
  early_stop: true
  early_stop_mode: "min"
  early_stop_patience: 100
  save_top_k: 3
logging:
  logging_name: "alignment_weird_file_test"
  run_id: null
  logging_prefix: "SEVIR-LR_AvgX"
  monitor_lr: true
  monitor_device: false
  track_grad_norm: -1
  use_wandb: true
  profiler: null
trainer:
  check_val_every_n_epoch: 3
  log_step_ratio: 0.001
  precision: 32
  find_unused_parameters: false
  num_sanity_val_steps: 2
eval:
  train_example_data_idx_list: []
  val_example_data_idx_list: []
  test_example_data_idx_list: []
  eval_example_only: false
  num_samples_per_context: 1
  save_gif: false
  gif_fps: 2.0
model:
  diffusion:
    timesteps: 1000
    beta_schedule: "linear"
    linear_start: 1.0e-4
    linear_end: 2.0e-2
    cosine_s: 8.0e-3
    given_betas: null
    # latent diffusion
    cond_stage_model: "__is_first_stage__"
    num_timesteps_cond: null
    cond_stage_trainable: false
    cond_stage_forward: null
    scale_by_std: false
    scale_factor: 1.0
  align:
    alignment_type: "avg_x"
    model_type: "cuboid"
    model_args:
      input_shape: [*out_len, 16, 16, 64]
      out_channels: 1
      base_units: 128
      scale_alpha: 1.0
      depth: [1, 1]
      downsample: 2
      downsample_type: "patch_merge"
      block_attn_patterns: "axial"
      num_heads: 4
      attn_drop: 0.1
      proj_drop: 0.1
      ffn_drop: 0.1
      ffn_activation: "gelu"
      gated_ffn: false
      norm_layer: "layer_norm"
      use_inter_ffn: true
      hierarchical_pos_embed: false
      pos_embed_type: "t+h+w"
      padding_type: "zeros"
      checkpoint_level: 0
      use_relative_pos: true
      self_attn_use_final_proj: true
      # global vectors
      num_global_vectors: 0
      use_global_vector_ffn: true
      use_global_self_attn: false
      separate_global_qkv: false
      global_dim_ratio: 1
      # initialization
      attn_linear_init_mode: "0"
      ffn_linear_init_mode: "0"
      ffn2_linear_init_mode: "2"
      attn_proj_linear_init_mode: "2"
      conv_init_mode: "0"
      down_linear_init_mode: "0"
      global_proj_linear_init_mode: "2"
      norm_init_mode: "0"
      # timestep embedding for diffusion
      time_embed_channels_mult: 4
      time_embed_use_scale_shift_norm: false
      time_embed_dropout: 0.0
      # readout
      pool: "attention"
      readout_seq: true
      out_len: *out_len
  vae:
    pretrained_ckpt_path: "pretrained_sevirlr_vae_8x8x64_v1_2.pt"
    data_channels: 1
    down_block_types: ['DownEncoderBlock2D', 'DownEncoderBlock2D', 'DownEncoderBlock2D', 'DownEncoderBlock2D']
    in_channels: 1
    block_out_channels: [128, 256, 512, 512]  # downsample `len(block_out_channels) - 1` times
    act_fn: 'silu'
    latent_channels: 64
    up_block_types: ['UpDecoderBlock2D', 'UpDecoderBlock2D', 'UpDecoderBlock2D', 'UpDecoderBlock2D']
    norm_num_groups: 32
    layers_per_block: 2
    out_channels: 1
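# ---------------------------------------------------------------------------
# Usage note (illustrative sketch, not part of the config): the &name/*name
# pairs above are standard YAML anchors/aliases, resolved when the file is
# parsed, so e.g. every `*out_len` alias receives the value 1 defined at
# `out_len: &out_len 1`. Assuming the config is consumed with OmegaConf
# (the file name "cfg.yaml" below is a placeholder):
#
#   from omegaconf import OmegaConf
#   oc = OmegaConf.load("cfg.yaml")
#   print(oc.model.align.model_args.input_shape)  # [1, 16, 16, 64]
# ---------------------------------------------------------------------------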