# lightning.pytorch==2.4.0.dev20240728
# Top-level random seed for the run (seed_everything option).
seed_everything: 123
# Trainer settings: GPU training on 1 node with 8 devices, using the
# DeepSpeed strategy and bf16-true precision.
trainer:
  accelerator: gpu
  strategy:
    class_path: lightning.pytorch.strategies.DeepSpeedStrategy
    init_args:
      accelerator: null
      # ZeRO optimization is enabled at stage 2.
      zero_optimization: true
      stage: 2
      remote_device: null
      offload_optimizer: false
      # Parameter offload is documented by Lightning/DeepSpeed as a ZeRO
      # stage 3 option, so it is disabled here to agree with `stage: 2`.
      # To actually offload parameters, set `stage: 3` and turn this back on.
      offload_parameters: false
      # Offload device/buffer settings below only matter when one of the
      # offload_* switches above is enabled.
      offload_params_device: cpu
      nvme_path: /local_nvme
      params_buffer_count: 5
      params_buffer_size: 100000000
      max_in_cpu: 1000000000
      offload_optimizer_device: cpu
      optimizer_buffer_count: 4
      block_size: 1048576
      queue_depth: 8
      single_submit: false
      overlap_events: true
      thread_count: 1
      pin_memory: true
      sub_group_size: 1000000000000
      # Communication / bucketing settings for ZeRO.
      contiguous_gradients: true
      overlap_comm: true
      allgather_partitions: true
      reduce_scatter: true
      allgather_bucket_size: 200000000
      reduce_bucket_size: 200000000
      zero_allow_untested_optimizer: true
      logging_batch_size_per_gpu: auto
      # No external DeepSpeed config is supplied; the arguments in this
      # mapping are used instead.
      config: null
      logging_level: 30
      parallel_devices: null
      cluster_environment: null
      # Loss-scaling knobs.
      # NOTE(review): presumably unused with `precision: bf16-true` - confirm.
      loss_scale: 0.0
      initial_scale_power: 16
      loss_scale_window: 1000
      hysteresis: 2
      min_loss_scale: 1
      # Activation checkpointing options, all disabled.
      partition_activations: false
      cpu_checkpointing: false
      contiguous_memory_optimization: false
      synchronize_checkpoint_boundary: false
      load_full_weights: false
      precision_plugin: null
      process_group_backend: null
  devices: 8
  num_nodes: 1
  precision: bf16-true
  # TensorBoard logger with save_dir /media/logs and run name "main".
  logger:
    class_path: lightning.pytorch.loggers.TensorBoardLogger
    init_args:
      save_dir: /media/logs
      name: main
      version: null
      log_graph: false
      default_hp_metric: true
      prefix: ''
      sub_dir: null
      comment: ''
      purge_step: null
      max_queue: 10
      flush_secs: 120
      filename_suffix: ''
  callbacks: null
  fast_dev_run: false
  # Training length: 2 epochs; max_steps is left at -1.
  max_epochs: 2
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: null
  check_val_every_n_epoch: 1
  # Sanity validation steps are set to 0; logging happens every step.
  num_sanity_val_steps: 0
  log_every_n_steps: 1
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  # Gradients are accumulated over 8 batches.
  accumulate_grad_batches: 8
  gradient_clip_val: null
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
# Model section: module hyperparameters and optimizer for the
# Mistral-7B-v0.2 checkpoint.
model:
  config:
    model_name: Mistral-7B-v0.2
    dtype: bfloat16
    num_thoughts: 2
    thought_length: 8
    lookahead_tokens: 4
    embedding_grad_weights: 100.0
    # Sampling is enabled at temperature 1.0; top_k and top_p are left
    # unset (null).
    temperature: 1.0
    do_sample: true
    # Same value as data.init_args.max_seq_length (120).
    train_max_length: 120
    offload_cache: false
    top_k: null
    top_p: null
  checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
  # weight_decay and init_lr repeat the values given to the optimizer
  # below (weight_decay 0.001, lr 1.0e-06); keep them in sync when editing.
  weight_decay: 0.001
  warmup_steps: 20
  policy_weight: 1.0
  init_lr: 1.0e-06
  optimizer:
    class_path: torch.optim.AdamW
    init_args:
      lr: 1.0e-06
      betas: [0.9, 0.999]
      eps: 1.0e-08
      weight_decay: 0.001
      amsgrad: false
      maximize: false
      foreach: null
      capturable: false
      differentiable: false
      fused: null
  # No scheduler is configured in this file.
  scheduler: null
# No checkpoint path is set; presumably the run starts fresh rather than
# resuming - confirm.
ckpt_path: null
# Data section: src.dataset.OpenWebMathDataModule with data_path
# /media/datasets/openwebmath.
data:
  class_path: src.dataset.OpenWebMathDataModule
  init_args:
    data_path: /media/datasets/openwebmath
    # Tokenizer uses the same checkpoint directory as model.checkpoint_dir.
    tokenizer:
      class_path: src.dataset.SpecialTokenizer
      init_args:
        checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    batch_size: 1
    # Same value as model.config.train_max_length (120).
    max_seq_length: 120
    num_samples: 2048
    # NOTE(review): -100 matches PyTorch's default cross-entropy
    # ignore_index; presumably used to mask labels - confirm.
    ignore_index: -100
    # Assumes the fraction applies to num_samples (0.125 * 2048 = 256)
    # - TODO confirm against the data module.
    val_split_fraction: 0.125
    # Distinct from the top-level seed_everything value (123); presumably
    # seeds the dataset split/sampling - confirm.
    seed: 42
    num_workers: 1