# lightning.pytorch==2.4.0.dev20240728
# Run configuration in class_path / init_args form: seed, trainer (DeepSpeed
# strategy + TensorBoard logger), model, optimizer and data module settings.
seed_everything: 123

trainer:
  accelerator: gpu
  strategy:
    class_path: lightning.pytorch.strategies.DeepSpeedStrategy
    init_args:
      accelerator: null
      zero_optimization: true
      # NOTE(review): stage 2 is combined with offload_parameters: true below.
      # DeepSpeed documents parameter offload as a ZeRO stage 3 feature --
      # confirm this combination is intended.
      stage: 2
      remote_device: null
      offload_optimizer: false
      offload_parameters: true
      offload_params_device: cpu
      nvme_path: /local_nvme
      params_buffer_count: 5
      params_buffer_size: 100000000
      max_in_cpu: 1000000000
      offload_optimizer_device: cpu
      optimizer_buffer_count: 4
      block_size: 1048576
      queue_depth: 8
      single_submit: false
      overlap_events: true
      thread_count: 1
      pin_memory: true
      sub_group_size: 1000000000000
      contiguous_gradients: true
      overlap_comm: true
      allgather_partitions: true
      reduce_scatter: true
      allgather_bucket_size: 200000000
      reduce_bucket_size: 200000000
      zero_allow_untested_optimizer: true
      logging_batch_size_per_gpu: auto
      config: null
      logging_level: 30
      parallel_devices: null
      cluster_environment: null
      loss_scale: 0.0
      initial_scale_power: 16
      loss_scale_window: 1000
      hysteresis: 2
      min_loss_scale: 1
      partition_activations: false
      cpu_checkpointing: false
      contiguous_memory_optimization: false
      synchronize_checkpoint_boundary: false
      load_full_weights: false
      precision_plugin: null
      process_group_backend: null
  devices: 8
  num_nodes: 1
  precision: bf16-true
  logger:
    class_path: lightning.pytorch.loggers.TensorBoardLogger
    init_args:
      save_dir: /media/logs
      name: main
      version: null
      log_graph: false
      default_hp_metric: true
      prefix: ''
      sub_dir: null
      comment: ''
      purge_step: null
      max_queue: 10
      flush_secs: 120
      filename_suffix: ''
  callbacks: null
  fast_dev_run: false
  max_epochs: 2
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: null
  check_val_every_n_epoch: 1
  num_sanity_val_steps: 0
  log_every_n_steps: 1
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 8
  gradient_clip_val: null
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null

model:
  # NOTE(review): the extent of `config` is an assumption -- it is closed
  # after `top_p`, with the remaining keys treated as direct model arguments.
  # Verify against the model class signature.
  config:
    model_name: Mistral-7B-v0.2
    dtype: bfloat16
    num_thoughts: 2
    thought_length: 8
    lookahead_tokens: 4
    embedding_grad_weights: 100.0
    temperature: 1.0
    do_sample: true
    # Same value as data.init_args.max_seq_length below.
    train_max_length: 120
    offload_cache: false
    top_k: null
    top_p: null
  # Same path as the tokenizer's checkpoint_dir in the data section.
  checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
  weight_decay: 0.001
  warmup_steps: 20
  policy_weight: 1.0
  # Same value as optimizer.init_args.lr below.
  init_lr: 1.0e-06

optimizer:
  class_path: torch.optim.AdamW
  init_args:
    lr: 1.0e-06
    betas:
      - 0.9
      - 0.999
    eps: 1.0e-08
    weight_decay: 0.001
    amsgrad: false
    maximize: false
    foreach: null
    capturable: false
    differentiable: false
    fused: null

scheduler: null

ckpt_path: null

data:
  class_path: src.dataset.OpenWebMathDataModule
  init_args:
    data_path: /media/datasets/openwebmath
    # NOTE(review): assumes the tokenizer takes only checkpoint_dir and the
    # keys from batch_size onward belong to the data module -- confirm.
    tokenizer:
      class_path: src.dataset.SpecialTokenizer
      init_args:
        checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    batch_size: 1
    max_seq_length: 120
    num_samples: 2048
    ignore_index: -100
    val_split_fraction: 0.125
    seed: 42
    num_workers: 1