{
  "train_micro_batch_size_per_gpu": "auto",
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "reduce_bucket_size": 100000000
  },
  "fp16": {
    "enabled": true
  },
  "gradient_accumulation_steps": "auto",
  "wall_clock_breakdown": false
}