---
# Training run configuration.
#
# NOTE(review): this file was previously collapsed onto a single line, which
# is not parseable YAML (a plain scalar may not contain ": "). The nesting
# below was reconstructed from key order and from the need to keep repeated
# keys (`mode`, `every_steps`, `name`) in separate mappings -- confirm it
# against the schema the consuming code expects.

# Top-level run settings.
mode: pt
device: gpu
precision: bf16
eval_only: false
predict_only: false
seed: 34534

# Model selection and construction options.
model:
  klass: hf_t5
  name: pszemraj/tFINE-900m-e16-d32
  # Values under `overwrite` presumably replace fields of the loaded model
  # config -- TODO confirm with the consumer.
  overwrite:
    dropout_rate: 0.0
  # Empty string, not null: no checkpoint path is given.
  checkpoint_path: ''
  random_init: false
  compile: true

# NOTE(review): assumed to be a top-level section rather than a child of
# `model`; verify against the code that reads the tokenizer name.
tokenizer:
  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5

# Data pipeline settings.
data:
  input_length: 1024
  mlm_probability: 0.15
  mean_noise_span_length: 3.0
  num_workers: 16

# Optimizer and learning-rate schedule.
optim:
  name: adamwscale
  base_lr: 0.01
  batch_size: 128
  total_steps: 20000
  # NOTE(review): -1 looks like a "disabled" sentinel so that `total_steps`
  # governs run length -- confirm.
  epochs: -1
  # 5000 of the 20000 total steps (25%) are warmup.
  warmup_steps: 5000
  lr_scheduler: cosine
  weight_decay: 0.0001
  grad_clip: 1.0
  grad_acc: 8
  final_cosine: 2.0e-05

# Evaluation cadence.
eval:
  # Far larger than optim.total_steps (20000), so a step-interval based
  # evaluation would presumably never trigger mid-run -- confirm intent.
  every_steps: 1000000000
  steps: 500

# Checkpoint cadence.
checkpoint:
  every_steps: 2500

# Logging and experiment tracking.
logging:
  use_wandb: true
  wandb_config:
    project: nanoT5
    entity: pszemraj
    tags:
      - 900m
      # Quoted so it stays a string instead of being read as the int 1024.
      - '1024'
    mode: online
  # NOTE(review): assumed to belong to `logging`, not `wandb_config` --
  # confirm.
  every_steps: 25
  grad_l2: true
  weights_l2: true