mode: pt
device: gpu
precision: bf16
eval_only: false
predict_only: false
seed: 34534
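# run-level flags: 'pt' selects span-corruption pre-training (nanoT5 uses 'ft'
# for fine-tuning); the run uses bf16 on a single GPU with a fixed seed.

# model: a ~900M-parameter T5 loaded via Hugging Face (klass: hf_t5); the name
# suggests 16 encoder and 32 decoder layers. Dropout is overridden to 0.0 for
# pre-training, compile: true enables torch.compile, and random_init: false
# presumably keeps the weights loaded from `name` instead of re-initializing.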
model:
  klass: hf_t5
  name: pszemraj/tFINE-900m-e16-d32
  overwrite:
    dropout_rate: 0.0
  checkpoint_path: ''
  random_init: false
  compile: true
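# tokenizer: per its name, a 48,128-entry BPE vocabulary trained on SlimPajama
# and built for T5-style models, replacing the stock T5 SentencePiece tokenizer;
# the model's embedding size is presumably matched to this vocab.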
tokenizer:
  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
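# data: T5 span corruption over 1024-token inputs. With mlm_probability 0.15 and
# mean_noise_span_length 3.0, roughly 15% of tokens (~154 of 1024) are corrupted
# in ~51 spans of mean length 3, each span collapsed to a single sentinel token.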
data:
  input_length: 1024
  mlm_probability: 0.15
  mean_noise_span_length: 3.0
  num_workers: 16
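# optim: adamwscale is nanoT5's AdamW variant that rescales the step size by each
# parameter's RMS (an Adafactor-style relative update; assumption based on the
# nanoT5 implementation). base_lr 0.01 warms up over 5,000 steps and then follows
# a cosine decay down to final_cosine 2e-5 at step 20,000; epochs: -1 means the
# run is bounded by total_steps rather than dataset passes.
# Batch arithmetic (assuming optim.batch_size is the per-optimizer-step batch, as
# in nanoT5): grad_acc 8 gives micro-batches of 128 / 8 = 16 sequences, and each
# optimizer step sees 128 x 1024 = 131,072 tokens, i.e. ~2.6B tokens in 20k steps.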
optim:
  name: adamwscale
  base_lr: 0.01
  batch_size: 128
  total_steps: 20000
  epochs: -1
  warmup_steps: 5000
  lr_scheduler: cosine
  weight_decay: 0.0001
  grad_clip: 1.0
  grad_acc: 8
  final_cosine: 2.0e-05
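# eval: every_steps is set to 1e9, which effectively disables periodic evaluation
# during the 20k-step run; when evaluation is run it is capped at 500 batches.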
eval:
  every_steps: 1000000000
  steps: 500
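# checkpoint: a checkpoint is written every 2,500 optimizer steps, i.e. 8 saves
# over the 20,000-step run.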
checkpoint:
  every_steps: 2500
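# logging: metrics go to Weights & Biases (project nanoT5, entity pszemraj,
# online mode) every 25 steps; grad_l2 and weights_l2 presumably add the global
# L2 norms of the gradients and parameters to the logs.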
logging:
  use_wandb: true
  wandb_config:
    project: nanoT5
    entity: pszemraj
    tags:
      - 900m
      - '1024'
    mode: online
  every_steps: 25
  grad_l2: true
  weights_l2: true