
The model was trained with the following setup (llm-foundry / Composer YAML config):

```yaml
variables:
  global_seed: 17
  max_seq_len: 8192
  # Run Name
  run_name: llama32_clm34_Q512xP400_8192

max_seq_len: ${variables.max_seq_len}
run_name: ${variables.run_name}

fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false


model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: mfajcik/clm_llama32_8K # RECIPE from Llama 3.2: first pretrain on 2k context, then 2 epochs on 8k context, then 1 epoch with hard negatives

  use_flash_attention_2: true
  copy_config:
    copy_implementation: v3.4_fulljoint_ss
    copy_token: "|copy|"

  config_overrides:
    span_heads: 256
    span_d: 6144
    # Queries
    K_past_positives: 312 # how many 'positives' to take from tokens with an annotated copy position (span gt)
    K_past_negatives: 200 # how many 'negatives' to sample from tokens without a copy position annotation (token gt)
    # Past states - how many candidates to consider for starts/ends for each query (number includes gt if available + negatives)
    K_start: 400
    K_end: 400
    smart_sampling: true
    hn_topk_positions: 800
    reweighting: false
#    sparse_bmm_triton: true
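    # Note: K_past_positives + K_past_negatives = 312 + 200 = 512 queries and
    # K_start = K_end = 400 candidate past positions, matching the "Q512xP400"
    # in the run name above.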

# Tokenizer
tokenizer:
  name: meta-llama/Llama-3.2-3B-Instruct
  kwargs:
    model_max_length: ${variables.max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: mfajcik/WildChat-copyN2_llama31_L8192
    split: train
    max_seq_len: ${variables.max_seq_len}
    allow_pad_trimming: true
    decoder_only_format: true
    shuffle: true
    preprocessing_fn: src.preprocessing.utils:filter_prompt_response
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0
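  # llm-foundry resolves `preprocessing_fn` given as `module.path:function_name`; such a
  # function typically maps each raw dataset example to a dict with 'prompt' and 'response'
  # keys for the finetuning collator. `filter_prompt_response` lives in this training repo's
  # own `src` package, so its exact filtering logic is defined there.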

eval_loader:
  name: finetuning
  dataset:
    hf_name: mfajcik/WildChat-copyN2_llama31_L8192
    split: validation
    max_seq_len: ${variables.max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    shuffle: false
    preprocessing_fn: src.preprocessing.utils:filter_prompt_response

  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

# Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 200ba
  alpha_f: 0.1
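  # i.e. the LR ramps linearly from 0 to the peak of 5e-6 (set in the optimizer below)
  # over the first 200 batches, then cosine-decays to alpha_f * peak = 5e-7 by the end
  # of training.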

optimizer:
  name: decoupled_lionw
  lr: 5e-6
  betas:
    - 0.9
    - 0.95
  weight_decay: 0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 2.0

max_duration: 2ep
eval_interval: 100ba


eval_first: false
global_train_batch_size: 128

# System
seed: ${variables.global_seed}
device_eval_batch_size: 1
device_train_microbatch_size: 1
precision: amp_bf16
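# Effective batching: global_train_batch_size (128) is the total batch per optimizer step
# across all ranks; with device_train_microbatch_size: 1, each rank accumulates gradients
# over 128 / world_size microbatches per step (e.g. 16 microbatches per GPU on 8 GPUs).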

# Logging
progress_bar: true
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: { }
  memory_monitor: { }
  runtime_estimator: { }

loggers:
  wandb:
    project: "copylm"  # Replace with your project name
    entity: "ifajcik-brno-university-of-technology"  # Replace with your username or team name
    name: "DBGN_Q512xP400_8192"  # Optional: name of the current experiment

# Checkpoint to local filesystem or remote object store
save_interval: 100ba
autoresume: true
save_num_checkpoints_to_keep: 25  # Important, this cleans up checkpoints saved to DISK

load_weights_only: true

save_folder: /storage/brno2/home/ifajcik/code/llm-foundry25/.saved/${variables.run_name}
```
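
For inference, the checkpoint and the tokenizer named in the config can in principle be loaded through the standard `transformers` API. The sketch below is an untested illustration: it assumes the uploaded weights load as a regular Llama-style causal LM, whereas the copy-specific components configured above (`copy_config`, `span_heads`, the `|copy|` token, ...) may require the original llm-foundry training code rather than vanilla `transformers`.

```python
# Minimal loading sketch (untested). Assumes the checkpoint loads as a standard
# Llama-style causal LM; the copy mechanism may need the original training code.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Tokenizer used during training (gated repo: requires accepting the Llama 3.2 license).
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

model = AutoModelForCausalLM.from_pretrained(
    "mfajcik/clm_llama32_8K",
    torch_dtype=torch.bfloat16,  # training ran in amp_bf16
    # attn_implementation="flash_attention_2",  # optional, requires flash-attn
)
model.eval()

# The training config registers a "|copy|" token; check whether the uploaded
# tokenizer already contains it before relying on it.
print("|copy|" in tokenizer.get_vocab())

inputs = tokenizer("Hello, world.", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```

If the repository ships custom modeling code for the copy heads, passing `trust_remote_code=True` to `from_pretrained` may additionally be needed.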