wandb_version: 1 _n_gpu: desc: null value: 1 _name_or_path: desc: null value: dmis-lab/biobert-base-cased-v1.1 _wandb: desc: null value: cli_version: 0.12.2 framework: huggingface huggingface_version: 4.10.0 is_jupyter_run: false is_kaggle_kernel: false m: - 1: train/global_step 6: - 3 - 1: gradients/classifier\.bias._type 5: 1 6: - 1 - 1: gradients/classifier\.bias.values 5: 1 6: - 1 - 1: gradients/classifier\.bias.bins 5: 1 6: - 1 - 1: gradients/classifier\.weight._type 5: 1 6: - 1 - 1: gradients/classifier\.weight.values 5: 1 6: - 1 - 1: gradients/classifier\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.position_embeddings\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.position_embeddings\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.position_embeddings\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.token_type_embeddings\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.token_type_embeddings\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.token_type_embeddings\.weight.bins 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.word_embeddings\.weight._type 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.word_embeddings\.weight.values 5: 1 6: - 1 - 1: gradients/bert\.embeddings\.word_embeddings\.weight.bins 5: 1 6: - 1 - 1: train/loss 5: 1 6: - 1 - 1: train/learning_rate 5: 1 6: - 1 - 1: train/epoch 5: 1 6: - 1 - 1: train/train_runtime 5: 1 6: - 1 - 1: train/train_samples_per_second 5: 1 6: - 1 - 1: train/train_steps_per_second 5: 1 6: - 1 - 1: train/total_flos 5: 1 6: - 1 - 1: train/train_loss 5: 1 6: - 1 - 1: eval/loss 5: 1 6: - 1 - 1: eval/precision 5: 1 6: - 1 - 1: eval/recall 5: 1 6: - 1 - 1: eval/f1 5: 1 6: - 1 - 1: eval/runtime 5: 1 6: - 1 - 1: eval/samples_per_second 5: 1 6: - 1 - 1: eval/steps_per_second 5: 1 6: - 1 python_version: 3.6.8 start_time: 1631810254 t: 1: - 1 - 2 - 3 - 5 - 11 2: - 1 - 2 - 3 - 5 - 11 3: - 1 - 7 - 13 4: 3.6.8 5: 0.12.2 6: 4.10.0 8: - 5 adafactor: desc: null value: false adam_beta1: desc: null value: 0.9 adam_beta2: desc: null value: 0.999 adam_epsilon: desc: null value: 1.0e-08 add_cross_attention: desc: null value: false architectures: desc: null value: null attention_probs_dropout_prob: desc: null value: 0.1 bad_words_ids: desc: null value: null bos_token_id: desc: null value: null chunk_size_feed_forward: desc: null value: 0 classifier_dropout: desc: null value: null dataloader_drop_last: desc: null value: false dataloader_num_workers: desc: null value: 0 dataloader_pin_memory: desc: null value: true ddp_find_unused_parameters: desc: null value: None debug: desc: null value: '[]' decoder_start_token_id: desc: null value: null deepspeed: desc: null value: None disable_tqdm: desc: null value: false diversity_penalty: desc: null value: 0.0 do_eval: desc: null value: true do_predict: desc: null value: true do_sample: desc: null value: false do_train: desc: null value: true early_stopping: desc: null value: false encoder_no_repeat_ngram_size: desc: null value: 0 eos_token_id: desc: null value: null eval_accumulation_steps: desc: null value: None eval_batch_size: desc: null value: 8 eval_steps: desc: null value: None evaluation_strategy: desc: null value: 'no' finetuning_task: desc: null value: null forced_bos_token_id: desc: null value: null forced_eos_token_id: desc: null value: null fp16: desc: null value: false fp16_backend: desc: null value: auto fp16_full_eval: desc: null value: false fp16_opt_level: desc: null value: O1 gradient_accumulation_steps: desc: null value: 1 gradient_checkpointing: desc: null value: false greater_is_better: desc: null value: None group_by_length: desc: null value: false hidden_act: desc: null value: gelu hidden_dropout_prob: desc: null value: 0.1 hidden_size: desc: null value: 768 id2label: desc: null value: '0': B-EPI '1': B-LOC '2': B-STAT '3': I-EPI '4': I-LOC '5': I-STAT '6': O ignore_data_skip: desc: null value: false initializer_range: desc: null value: 0.02 intermediate_size: desc: null value: 3072 is_decoder: desc: null value: false is_encoder_decoder: desc: null value: false label2id: desc: null value: B-EPI: 0 B-LOC: 1 B-STAT: 2 I-EPI: 3 I-LOC: 4 I-STAT: 5 O: 6 label_names: desc: null value: None label_smoothing_factor: desc: null value: 0.0 layer_norm_eps: desc: null value: 1.0e-12 learning_rate: desc: null value: 5.0e-05 length_column_name: desc: null value: length length_penalty: desc: null value: 1.0 load_best_model_at_end: desc: null value: false local_rank: desc: null value: -1 log_level: desc: null value: -1 log_level_replica: desc: null value: -1 log_on_each_node: desc: null value: true logging_dir: desc: null value: ./resultsV3.2/runs/Sep16_16-37-25_ordr-neo4j-dev-ec2-04 logging_first_step: desc: null value: false logging_steps: desc: null value: 500 logging_strategy: desc: null value: steps lr_scheduler_type: desc: null value: linear max_grad_norm: desc: null value: 1.0 max_length: desc: null value: 20 max_position_embeddings: desc: null value: 512 max_steps: desc: null value: -1 metric_for_best_model: desc: null value: None min_length: desc: null value: 0 model_type: desc: null value: bert mp_parameters: desc: null value: '' no_cuda: desc: null value: false no_repeat_ngram_size: desc: null value: 0 num_attention_heads: desc: null value: 12 num_beam_groups: desc: null value: 1 num_beams: desc: null value: 1 num_hidden_layers: desc: null value: 12 num_return_sequences: desc: null value: 1 num_train_epochs: desc: null value: 30.0 output_attentions: desc: null value: false output_dir: desc: null value: ./resultsV3.2 output_hidden_states: desc: null value: false output_scores: desc: null value: false overwrite_output_dir: desc: null value: true pad_token_id: desc: null value: 0 past_index: desc: null value: -1 per_device_eval_batch_size: desc: null value: 8 per_device_train_batch_size: desc: null value: 16 per_gpu_eval_batch_size: desc: null value: None per_gpu_train_batch_size: desc: null value: None position_embedding_type: desc: null value: absolute prediction_loss_only: desc: null value: false prefix: desc: null value: null problem_type: desc: null value: null pruned_heads: desc: null value: {} push_to_hub: desc: null value: false push_to_hub_model_id: desc: null value: resultsV3.2 push_to_hub_organization: desc: null value: None push_to_hub_token: desc: null value: None remove_invalid_values: desc: null value: false remove_unused_columns: desc: null value: true repetition_penalty: desc: null value: 1.0 report_to: desc: null value: '[''tensorboard'', ''wandb'']' resume_from_checkpoint: desc: null value: None return_dict: desc: null value: true return_dict_in_generate: desc: null value: false run_name: desc: null value: ./resultsV3.2 save_on_each_node: desc: null value: false save_steps: desc: null value: 2500 save_strategy: desc: null value: steps save_total_limit: desc: null value: None seed: desc: null value: 1 sep_token_id: desc: null value: null sharded_ddp: desc: null value: '[]' skip_memory_metrics: desc: null value: true task_specific_params: desc: null value: null temperature: desc: null value: 1.0 tie_encoder_decoder: desc: null value: false tie_word_embeddings: desc: null value: true tokenizer_class: desc: null value: null top_k: desc: null value: 50 top_p: desc: null value: 1.0 torch_dtype: desc: null value: null torchscript: desc: null value: false tpu_metrics_debug: desc: null value: false tpu_num_cores: desc: null value: None train_batch_size: desc: null value: 16 transformers_version: desc: null value: 4.10.0 type_vocab_size: desc: null value: 2 use_bfloat16: desc: null value: false use_cache: desc: null value: true use_legacy_prediction_loop: desc: null value: false vocab_size: desc: null value: 28996 warmup_ratio: desc: null value: 0.0 warmup_steps: desc: null value: 0 weight_decay: desc: null value: 0.05