cabrooks commited on
Commit
c206b78
·
1 Parent(s): a06f792

Upload 7 files

Browse files
Files changed (7) hide show
  1. config.json +28 -0
  2. optimizer.pt +3 -0
  3. pytorch_model.bin +3 -0
  4. rng_state.pth +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +3866 -0
  7. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/scratch/gpfs/cabrooks/Greek_PH/out_ph16/checkpoint-805276",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_max_length": 512,
17
+ "model_type": "bert",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "output_past": true,
21
+ "pad_token_id": 0,
22
+ "position_embedding_type": "absolute",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.18.0",
25
+ "type_vocab_size": 2,
26
+ "use_cache": true,
27
+ "vocab_size": 35000
28
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5aa9e38a084c3ece0887d9550b9c64799df1e38089cf4b81a2088994407363c7
3
+ size 903780505
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3b08325547fc9975cc2bc5dbb036b25d78831e01bbefd92a0b714ae8db3a7d6
3
+ size 451903147
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e41760d07980a6f6c61dce65798755b98b61111381957b2dd30623f8ddeab413
3
+ size 15523
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:453cc126beea0b9d0dcf6c3c7e3b32f8bcb3ad1809624e5ed8394daa1d986cba
3
+ size 623
trainer_state.json ADDED
@@ -0,0 +1,3866 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 110.0,
5
+ "global_step": 962830,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.4,
12
+ "learning_rate": 4.992000456986176e-05,
13
+ "loss": 0.9599,
14
+ "step": 3501
15
+ },
16
+ {
17
+ "epoch": 0.4,
18
+ "eval_loss": 0.9476152062416077,
19
+ "eval_runtime": 178.6207,
20
+ "eval_samples_per_second": 174.224,
21
+ "eval_steps_per_second": 10.889,
22
+ "step": 3501
23
+ },
24
+ {
25
+ "epoch": 0.8,
26
+ "learning_rate": 4.984000913972353e-05,
27
+ "loss": 0.9731,
28
+ "step": 7002
29
+ },
30
+ {
31
+ "epoch": 0.8,
32
+ "eval_loss": 0.9536693692207336,
33
+ "eval_runtime": 178.7725,
34
+ "eval_samples_per_second": 174.076,
35
+ "eval_steps_per_second": 10.88,
36
+ "step": 7002
37
+ },
38
+ {
39
+ "epoch": 1.2,
40
+ "learning_rate": 4.9760013709585284e-05,
41
+ "loss": 0.9745,
42
+ "step": 10503
43
+ },
44
+ {
45
+ "epoch": 1.2,
46
+ "eval_loss": 0.9530463218688965,
47
+ "eval_runtime": 178.7418,
48
+ "eval_samples_per_second": 174.106,
49
+ "eval_steps_per_second": 10.882,
50
+ "step": 10503
51
+ },
52
+ {
53
+ "epoch": 1.6,
54
+ "learning_rate": 4.9680018279447047e-05,
55
+ "loss": 0.9747,
56
+ "step": 14004
57
+ },
58
+ {
59
+ "epoch": 1.6,
60
+ "eval_loss": 0.9569175839424133,
61
+ "eval_runtime": 178.7659,
62
+ "eval_samples_per_second": 174.082,
63
+ "eval_steps_per_second": 10.88,
64
+ "step": 14004
65
+ },
66
+ {
67
+ "epoch": 2.0,
68
+ "learning_rate": 4.960002284930881e-05,
69
+ "loss": 0.978,
70
+ "step": 17505
71
+ },
72
+ {
73
+ "epoch": 2.0,
74
+ "eval_loss": 0.958555281162262,
75
+ "eval_runtime": 178.7319,
76
+ "eval_samples_per_second": 174.115,
77
+ "eval_steps_per_second": 10.882,
78
+ "step": 17505
79
+ },
80
+ {
81
+ "epoch": 2.4,
82
+ "learning_rate": 4.952002741917057e-05,
83
+ "loss": 0.9736,
84
+ "step": 21006
85
+ },
86
+ {
87
+ "epoch": 2.4,
88
+ "eval_loss": 0.9602962136268616,
89
+ "eval_runtime": 178.815,
90
+ "eval_samples_per_second": 174.035,
91
+ "eval_steps_per_second": 10.877,
92
+ "step": 21006
93
+ },
94
+ {
95
+ "epoch": 2.8,
96
+ "learning_rate": 4.9440031989032335e-05,
97
+ "loss": 0.9772,
98
+ "step": 24507
99
+ },
100
+ {
101
+ "epoch": 2.8,
102
+ "eval_loss": 0.9578276872634888,
103
+ "eval_runtime": 178.2789,
104
+ "eval_samples_per_second": 174.558,
105
+ "eval_steps_per_second": 10.91,
106
+ "step": 24507
107
+ },
108
+ {
109
+ "epoch": 3.2,
110
+ "learning_rate": 4.93600365588941e-05,
111
+ "loss": 0.9771,
112
+ "step": 28008
113
+ },
114
+ {
115
+ "epoch": 3.2,
116
+ "eval_loss": 0.9583005309104919,
117
+ "eval_runtime": 177.473,
118
+ "eval_samples_per_second": 175.351,
119
+ "eval_steps_per_second": 10.959,
120
+ "step": 28008
121
+ },
122
+ {
123
+ "epoch": 3.6,
124
+ "learning_rate": 4.928004112875585e-05,
125
+ "loss": 0.9773,
126
+ "step": 31509
127
+ },
128
+ {
129
+ "epoch": 3.6,
130
+ "eval_loss": 0.9513981938362122,
131
+ "eval_runtime": 177.4973,
132
+ "eval_samples_per_second": 175.327,
133
+ "eval_steps_per_second": 10.958,
134
+ "step": 31509
135
+ },
136
+ {
137
+ "epoch": 4.0,
138
+ "learning_rate": 4.920004569861762e-05,
139
+ "loss": 0.9802,
140
+ "step": 35010
141
+ },
142
+ {
143
+ "epoch": 4.0,
144
+ "eval_loss": 0.9571573138237,
145
+ "eval_runtime": 177.3934,
146
+ "eval_samples_per_second": 175.429,
147
+ "eval_steps_per_second": 10.964,
148
+ "step": 35010
149
+ },
150
+ {
151
+ "epoch": 4.4,
152
+ "learning_rate": 4.912005026847938e-05,
153
+ "loss": 0.9741,
154
+ "step": 38511
155
+ },
156
+ {
157
+ "epoch": 4.4,
158
+ "eval_loss": 0.9547442197799683,
159
+ "eval_runtime": 177.5692,
160
+ "eval_samples_per_second": 175.256,
161
+ "eval_steps_per_second": 10.953,
162
+ "step": 38511
163
+ },
164
+ {
165
+ "epoch": 4.8,
166
+ "learning_rate": 4.904005483834114e-05,
167
+ "loss": 0.9776,
168
+ "step": 42012
169
+ },
170
+ {
171
+ "epoch": 4.8,
172
+ "eval_loss": 0.9541487097740173,
173
+ "eval_runtime": 177.564,
174
+ "eval_samples_per_second": 175.261,
175
+ "eval_steps_per_second": 10.954,
176
+ "step": 42012
177
+ },
178
+ {
179
+ "epoch": 5.2,
180
+ "learning_rate": 4.8960059408202904e-05,
181
+ "loss": 0.9737,
182
+ "step": 45513
183
+ },
184
+ {
185
+ "epoch": 5.2,
186
+ "eval_loss": 0.9530224204063416,
187
+ "eval_runtime": 177.5256,
188
+ "eval_samples_per_second": 175.299,
189
+ "eval_steps_per_second": 10.956,
190
+ "step": 45513
191
+ },
192
+ {
193
+ "epoch": 5.6,
194
+ "learning_rate": 4.888006397806467e-05,
195
+ "loss": 0.9758,
196
+ "step": 49014
197
+ },
198
+ {
199
+ "epoch": 5.6,
200
+ "eval_loss": 0.9559855461120605,
201
+ "eval_runtime": 177.638,
202
+ "eval_samples_per_second": 175.188,
203
+ "eval_steps_per_second": 10.949,
204
+ "step": 49014
205
+ },
206
+ {
207
+ "epoch": 6.0,
208
+ "learning_rate": 4.880006854792642e-05,
209
+ "loss": 0.9777,
210
+ "step": 52515
211
+ },
212
+ {
213
+ "epoch": 6.0,
214
+ "eval_loss": 0.9567117691040039,
215
+ "eval_runtime": 177.402,
216
+ "eval_samples_per_second": 175.421,
217
+ "eval_steps_per_second": 10.964,
218
+ "step": 52515
219
+ },
220
+ {
221
+ "epoch": 6.4,
222
+ "learning_rate": 4.872007311778819e-05,
223
+ "loss": 0.9726,
224
+ "step": 56016
225
+ },
226
+ {
227
+ "epoch": 6.4,
228
+ "eval_loss": 0.9548764228820801,
229
+ "eval_runtime": 177.4726,
230
+ "eval_samples_per_second": 175.351,
231
+ "eval_steps_per_second": 10.959,
232
+ "step": 56016
233
+ },
234
+ {
235
+ "epoch": 6.8,
236
+ "learning_rate": 4.864007768764995e-05,
237
+ "loss": 0.977,
238
+ "step": 59517
239
+ },
240
+ {
241
+ "epoch": 6.8,
242
+ "eval_loss": 0.9567227959632874,
243
+ "eval_runtime": 177.3888,
244
+ "eval_samples_per_second": 175.434,
245
+ "eval_steps_per_second": 10.965,
246
+ "step": 59517
247
+ },
248
+ {
249
+ "epoch": 7.2,
250
+ "learning_rate": 4.856008225751171e-05,
251
+ "loss": 0.9736,
252
+ "step": 63018
253
+ },
254
+ {
255
+ "epoch": 7.2,
256
+ "eval_loss": 0.9573630094528198,
257
+ "eval_runtime": 177.3454,
258
+ "eval_samples_per_second": 175.477,
259
+ "eval_steps_per_second": 10.967,
260
+ "step": 63018
261
+ },
262
+ {
263
+ "epoch": 7.6,
264
+ "learning_rate": 4.8480086827373474e-05,
265
+ "loss": 0.9732,
266
+ "step": 66519
267
+ },
268
+ {
269
+ "epoch": 7.6,
270
+ "eval_loss": 0.9487555623054504,
271
+ "eval_runtime": 177.2246,
272
+ "eval_samples_per_second": 175.596,
273
+ "eval_steps_per_second": 10.975,
274
+ "step": 66519
275
+ },
276
+ {
277
+ "epoch": 8.0,
278
+ "learning_rate": 4.8400091397235236e-05,
279
+ "loss": 0.9743,
280
+ "step": 70020
281
+ },
282
+ {
283
+ "epoch": 8.0,
284
+ "eval_loss": 0.9511750936508179,
285
+ "eval_runtime": 177.2113,
286
+ "eval_samples_per_second": 175.61,
287
+ "eval_steps_per_second": 10.976,
288
+ "step": 70020
289
+ },
290
+ {
291
+ "epoch": 8.4,
292
+ "learning_rate": 4.832009596709699e-05,
293
+ "loss": 0.9684,
294
+ "step": 73521
295
+ },
296
+ {
297
+ "epoch": 8.4,
298
+ "eval_loss": 0.954649031162262,
299
+ "eval_runtime": 177.1791,
300
+ "eval_samples_per_second": 175.642,
301
+ "eval_steps_per_second": 10.978,
302
+ "step": 73521
303
+ },
304
+ {
305
+ "epoch": 8.8,
306
+ "learning_rate": 4.824010053695876e-05,
307
+ "loss": 0.974,
308
+ "step": 77022
309
+ },
310
+ {
311
+ "epoch": 8.8,
312
+ "eval_loss": 0.9532359838485718,
313
+ "eval_runtime": 177.517,
314
+ "eval_samples_per_second": 175.307,
315
+ "eval_steps_per_second": 10.957,
316
+ "step": 77022
317
+ },
318
+ {
319
+ "epoch": 9.2,
320
+ "learning_rate": 4.816010510682052e-05,
321
+ "loss": 0.9713,
322
+ "step": 80523
323
+ },
324
+ {
325
+ "epoch": 9.2,
326
+ "eval_loss": 0.9548117518424988,
327
+ "eval_runtime": 177.4324,
328
+ "eval_samples_per_second": 175.391,
329
+ "eval_steps_per_second": 10.962,
330
+ "step": 80523
331
+ },
332
+ {
333
+ "epoch": 9.6,
334
+ "learning_rate": 4.808010967668228e-05,
335
+ "loss": 0.9693,
336
+ "step": 84024
337
+ },
338
+ {
339
+ "epoch": 9.6,
340
+ "eval_loss": 0.9545753002166748,
341
+ "eval_runtime": 177.4929,
342
+ "eval_samples_per_second": 175.331,
343
+ "eval_steps_per_second": 10.958,
344
+ "step": 84024
345
+ },
346
+ {
347
+ "epoch": 10.0,
348
+ "learning_rate": 4.800011424654404e-05,
349
+ "loss": 0.9734,
350
+ "step": 87525
351
+ },
352
+ {
353
+ "epoch": 10.0,
354
+ "eval_loss": 0.9541465044021606,
355
+ "eval_runtime": 177.7477,
356
+ "eval_samples_per_second": 175.08,
357
+ "eval_steps_per_second": 10.942,
358
+ "step": 87525
359
+ },
360
+ {
361
+ "epoch": 10.4,
362
+ "learning_rate": 4.7920118816405806e-05,
363
+ "loss": 0.9681,
364
+ "step": 91026
365
+ },
366
+ {
367
+ "epoch": 10.4,
368
+ "eval_loss": 0.9508066177368164,
369
+ "eval_runtime": 177.4879,
370
+ "eval_samples_per_second": 175.336,
371
+ "eval_steps_per_second": 10.958,
372
+ "step": 91026
373
+ },
374
+ {
375
+ "epoch": 10.8,
376
+ "learning_rate": 4.784012338626757e-05,
377
+ "loss": 0.9713,
378
+ "step": 94527
379
+ },
380
+ {
381
+ "epoch": 10.8,
382
+ "eval_loss": 0.953546404838562,
383
+ "eval_runtime": 177.5133,
384
+ "eval_samples_per_second": 175.311,
385
+ "eval_steps_per_second": 10.957,
386
+ "step": 94527
387
+ },
388
+ {
389
+ "epoch": 11.2,
390
+ "learning_rate": 4.776012795612933e-05,
391
+ "loss": 0.9682,
392
+ "step": 98028
393
+ },
394
+ {
395
+ "epoch": 11.2,
396
+ "eval_loss": 0.9516591429710388,
397
+ "eval_runtime": 177.5065,
398
+ "eval_samples_per_second": 175.318,
399
+ "eval_steps_per_second": 10.957,
400
+ "step": 98028
401
+ },
402
+ {
403
+ "epoch": 11.6,
404
+ "learning_rate": 4.768013252599109e-05,
405
+ "loss": 0.9678,
406
+ "step": 101529
407
+ },
408
+ {
409
+ "epoch": 11.6,
410
+ "eval_loss": 0.9487663507461548,
411
+ "eval_runtime": 177.5275,
412
+ "eval_samples_per_second": 175.297,
413
+ "eval_steps_per_second": 10.956,
414
+ "step": 101529
415
+ },
416
+ {
417
+ "epoch": 12.0,
418
+ "learning_rate": 4.760013709585286e-05,
419
+ "loss": 0.9692,
420
+ "step": 105030
421
+ },
422
+ {
423
+ "epoch": 12.0,
424
+ "eval_loss": 0.9506123065948486,
425
+ "eval_runtime": 177.4334,
426
+ "eval_samples_per_second": 175.39,
427
+ "eval_steps_per_second": 10.962,
428
+ "step": 105030
429
+ },
430
+ {
431
+ "epoch": 12.4,
432
+ "learning_rate": 4.752014166571461e-05,
433
+ "loss": 0.9648,
434
+ "step": 108531
435
+ },
436
+ {
437
+ "epoch": 12.4,
438
+ "eval_loss": 0.9482792019844055,
439
+ "eval_runtime": 177.4207,
440
+ "eval_samples_per_second": 175.402,
441
+ "eval_steps_per_second": 10.963,
442
+ "step": 108531
443
+ },
444
+ {
445
+ "epoch": 12.8,
446
+ "learning_rate": 4.7440146235576376e-05,
447
+ "loss": 0.9664,
448
+ "step": 112032
449
+ },
450
+ {
451
+ "epoch": 12.8,
452
+ "eval_loss": 0.9506826400756836,
453
+ "eval_runtime": 177.4367,
454
+ "eval_samples_per_second": 175.386,
455
+ "eval_steps_per_second": 10.962,
456
+ "step": 112032
457
+ },
458
+ {
459
+ "epoch": 13.2,
460
+ "learning_rate": 4.736015080543814e-05,
461
+ "loss": 0.9656,
462
+ "step": 115533
463
+ },
464
+ {
465
+ "epoch": 13.2,
466
+ "eval_loss": 0.9522765874862671,
467
+ "eval_runtime": 177.6082,
468
+ "eval_samples_per_second": 175.217,
469
+ "eval_steps_per_second": 10.951,
470
+ "step": 115533
471
+ },
472
+ {
473
+ "epoch": 13.6,
474
+ "learning_rate": 4.72801553752999e-05,
475
+ "loss": 0.9678,
476
+ "step": 119034
477
+ },
478
+ {
479
+ "epoch": 13.6,
480
+ "eval_loss": 0.9502620100975037,
481
+ "eval_runtime": 177.507,
482
+ "eval_samples_per_second": 175.317,
483
+ "eval_steps_per_second": 10.957,
484
+ "step": 119034
485
+ },
486
+ {
487
+ "epoch": 14.0,
488
+ "learning_rate": 4.720015994516166e-05,
489
+ "loss": 0.9652,
490
+ "step": 122535
491
+ },
492
+ {
493
+ "epoch": 14.0,
494
+ "eval_loss": 0.9443088173866272,
495
+ "eval_runtime": 177.5507,
496
+ "eval_samples_per_second": 175.274,
497
+ "eval_steps_per_second": 10.955,
498
+ "step": 122535
499
+ },
500
+ {
501
+ "epoch": 14.4,
502
+ "learning_rate": 4.7120164515023426e-05,
503
+ "loss": 0.9624,
504
+ "step": 126036
505
+ },
506
+ {
507
+ "epoch": 14.4,
508
+ "eval_loss": 0.9469555020332336,
509
+ "eval_runtime": 177.4174,
510
+ "eval_samples_per_second": 175.406,
511
+ "eval_steps_per_second": 10.963,
512
+ "step": 126036
513
+ },
514
+ {
515
+ "epoch": 14.8,
516
+ "learning_rate": 4.704016908488518e-05,
517
+ "loss": 0.9632,
518
+ "step": 129537
519
+ },
520
+ {
521
+ "epoch": 14.8,
522
+ "eval_loss": 0.9473945498466492,
523
+ "eval_runtime": 177.5092,
524
+ "eval_samples_per_second": 175.315,
525
+ "eval_steps_per_second": 10.957,
526
+ "step": 129537
527
+ },
528
+ {
529
+ "epoch": 15.2,
530
+ "learning_rate": 4.6960173654746945e-05,
531
+ "loss": 0.9625,
532
+ "step": 133038
533
+ },
534
+ {
535
+ "epoch": 15.2,
536
+ "eval_loss": 0.948591947555542,
537
+ "eval_runtime": 177.5625,
538
+ "eval_samples_per_second": 175.262,
539
+ "eval_steps_per_second": 10.954,
540
+ "step": 133038
541
+ },
542
+ {
543
+ "epoch": 15.6,
544
+ "learning_rate": 4.688017822460871e-05,
545
+ "loss": 0.9606,
546
+ "step": 136539
547
+ },
548
+ {
549
+ "epoch": 15.6,
550
+ "eval_loss": 0.9465170502662659,
551
+ "eval_runtime": 177.5019,
552
+ "eval_samples_per_second": 175.322,
553
+ "eval_steps_per_second": 10.958,
554
+ "step": 136539
555
+ },
556
+ {
557
+ "epoch": 16.0,
558
+ "learning_rate": 4.680018279447047e-05,
559
+ "loss": 0.9623,
560
+ "step": 140040
561
+ },
562
+ {
563
+ "epoch": 16.0,
564
+ "eval_loss": 0.9485459327697754,
565
+ "eval_runtime": 177.4543,
566
+ "eval_samples_per_second": 175.369,
567
+ "eval_steps_per_second": 10.961,
568
+ "step": 140040
569
+ },
570
+ {
571
+ "epoch": 16.4,
572
+ "learning_rate": 4.6720187364332226e-05,
573
+ "loss": 0.9621,
574
+ "step": 143541
575
+ },
576
+ {
577
+ "epoch": 16.4,
578
+ "eval_loss": 0.9435310959815979,
579
+ "eval_runtime": 177.7019,
580
+ "eval_samples_per_second": 175.125,
581
+ "eval_steps_per_second": 10.945,
582
+ "step": 143541
583
+ },
584
+ {
585
+ "epoch": 16.8,
586
+ "learning_rate": 4.6640191934193996e-05,
587
+ "loss": 0.9608,
588
+ "step": 147042
589
+ },
590
+ {
591
+ "epoch": 16.8,
592
+ "eval_loss": 0.9458166360855103,
593
+ "eval_runtime": 177.4161,
594
+ "eval_samples_per_second": 175.407,
595
+ "eval_steps_per_second": 10.963,
596
+ "step": 147042
597
+ },
598
+ {
599
+ "epoch": 17.2,
600
+ "learning_rate": 4.656019650405575e-05,
601
+ "loss": 0.9607,
602
+ "step": 150543
603
+ },
604
+ {
605
+ "epoch": 17.2,
606
+ "eval_loss": 0.944205641746521,
607
+ "eval_runtime": 177.3072,
608
+ "eval_samples_per_second": 175.515,
609
+ "eval_steps_per_second": 10.97,
610
+ "step": 150543
611
+ },
612
+ {
613
+ "epoch": 17.6,
614
+ "learning_rate": 4.6480201073917515e-05,
615
+ "loss": 0.9573,
616
+ "step": 154044
617
+ },
618
+ {
619
+ "epoch": 17.6,
620
+ "eval_loss": 0.948502242565155,
621
+ "eval_runtime": 177.4753,
622
+ "eval_samples_per_second": 175.348,
623
+ "eval_steps_per_second": 10.959,
624
+ "step": 154044
625
+ },
626
+ {
627
+ "epoch": 18.0,
628
+ "learning_rate": 4.640020564377928e-05,
629
+ "loss": 0.9629,
630
+ "step": 157545
631
+ },
632
+ {
633
+ "epoch": 18.0,
634
+ "eval_loss": 0.9456846714019775,
635
+ "eval_runtime": 177.5771,
636
+ "eval_samples_per_second": 175.248,
637
+ "eval_steps_per_second": 10.953,
638
+ "step": 157545
639
+ },
640
+ {
641
+ "epoch": 18.4,
642
+ "learning_rate": 4.632021021364104e-05,
643
+ "loss": 0.9576,
644
+ "step": 161046
645
+ },
646
+ {
647
+ "epoch": 18.4,
648
+ "eval_loss": 0.9430428147315979,
649
+ "eval_runtime": 177.5195,
650
+ "eval_samples_per_second": 175.305,
651
+ "eval_steps_per_second": 10.957,
652
+ "step": 161046
653
+ },
654
+ {
655
+ "epoch": 18.8,
656
+ "learning_rate": 4.62402147835028e-05,
657
+ "loss": 0.9572,
658
+ "step": 164547
659
+ },
660
+ {
661
+ "epoch": 18.8,
662
+ "eval_loss": 0.945507824420929,
663
+ "eval_runtime": 177.5717,
664
+ "eval_samples_per_second": 175.253,
665
+ "eval_steps_per_second": 10.953,
666
+ "step": 164547
667
+ },
668
+ {
669
+ "epoch": 19.2,
670
+ "learning_rate": 4.6160219353364566e-05,
671
+ "loss": 0.9561,
672
+ "step": 168048
673
+ },
674
+ {
675
+ "epoch": 19.2,
676
+ "eval_loss": 0.9430563449859619,
677
+ "eval_runtime": 177.639,
678
+ "eval_samples_per_second": 175.187,
679
+ "eval_steps_per_second": 10.949,
680
+ "step": 168048
681
+ },
682
+ {
683
+ "epoch": 19.6,
684
+ "learning_rate": 4.608022392322632e-05,
685
+ "loss": 0.9584,
686
+ "step": 171549
687
+ },
688
+ {
689
+ "epoch": 19.6,
690
+ "eval_loss": 0.944654107093811,
691
+ "eval_runtime": 177.2628,
692
+ "eval_samples_per_second": 175.559,
693
+ "eval_steps_per_second": 10.972,
694
+ "step": 171549
695
+ },
696
+ {
697
+ "epoch": 20.0,
698
+ "learning_rate": 4.600022849308809e-05,
699
+ "loss": 0.9571,
700
+ "step": 175050
701
+ },
702
+ {
703
+ "epoch": 20.0,
704
+ "eval_loss": 0.9449816346168518,
705
+ "eval_runtime": 177.5343,
706
+ "eval_samples_per_second": 175.29,
707
+ "eval_steps_per_second": 10.956,
708
+ "step": 175050
709
+ },
710
+ {
711
+ "epoch": 20.4,
712
+ "learning_rate": 4.592023306294985e-05,
713
+ "loss": 0.9532,
714
+ "step": 178551
715
+ },
716
+ {
717
+ "epoch": 20.4,
718
+ "eval_loss": 0.9422610998153687,
719
+ "eval_runtime": 177.5589,
720
+ "eval_samples_per_second": 175.266,
721
+ "eval_steps_per_second": 10.954,
722
+ "step": 178551
723
+ },
724
+ {
725
+ "epoch": 20.8,
726
+ "learning_rate": 4.584023763281161e-05,
727
+ "loss": 0.955,
728
+ "step": 182052
729
+ },
730
+ {
731
+ "epoch": 20.8,
732
+ "eval_loss": 0.9425234794616699,
733
+ "eval_runtime": 177.4849,
734
+ "eval_samples_per_second": 175.339,
735
+ "eval_steps_per_second": 10.959,
736
+ "step": 182052
737
+ },
738
+ {
739
+ "epoch": 21.2,
740
+ "learning_rate": 4.576024220267337e-05,
741
+ "loss": 0.9542,
742
+ "step": 185553
743
+ },
744
+ {
745
+ "epoch": 21.2,
746
+ "eval_loss": 0.940060555934906,
747
+ "eval_runtime": 177.7301,
748
+ "eval_samples_per_second": 175.097,
749
+ "eval_steps_per_second": 10.944,
750
+ "step": 185553
751
+ },
752
+ {
753
+ "epoch": 21.6,
754
+ "learning_rate": 4.5680246772535135e-05,
755
+ "loss": 0.9512,
756
+ "step": 189054
757
+ },
758
+ {
759
+ "epoch": 21.6,
760
+ "eval_loss": 0.9436028599739075,
761
+ "eval_runtime": 177.5045,
762
+ "eval_samples_per_second": 175.319,
763
+ "eval_steps_per_second": 10.957,
764
+ "step": 189054
765
+ },
766
+ {
767
+ "epoch": 22.0,
768
+ "learning_rate": 4.560025134239689e-05,
769
+ "loss": 0.9556,
770
+ "step": 192555
771
+ },
772
+ {
773
+ "epoch": 22.0,
774
+ "eval_loss": 0.9408882260322571,
775
+ "eval_runtime": 177.5846,
776
+ "eval_samples_per_second": 175.24,
777
+ "eval_steps_per_second": 10.953,
778
+ "step": 192555
779
+ },
780
+ {
781
+ "epoch": 22.4,
782
+ "learning_rate": 4.552025591225866e-05,
783
+ "loss": 0.9528,
784
+ "step": 196056
785
+ },
786
+ {
787
+ "epoch": 22.4,
788
+ "eval_loss": 0.9434440732002258,
789
+ "eval_runtime": 177.5389,
790
+ "eval_samples_per_second": 175.286,
791
+ "eval_steps_per_second": 10.955,
792
+ "step": 196056
793
+ },
794
+ {
795
+ "epoch": 22.8,
796
+ "learning_rate": 4.5440260482120416e-05,
797
+ "loss": 0.9519,
798
+ "step": 199557
799
+ },
800
+ {
801
+ "epoch": 22.8,
802
+ "eval_loss": 0.9397256970405579,
803
+ "eval_runtime": 177.7195,
804
+ "eval_samples_per_second": 175.107,
805
+ "eval_steps_per_second": 10.944,
806
+ "step": 199557
807
+ },
808
+ {
809
+ "epoch": 23.2,
810
+ "learning_rate": 4.536026505198218e-05,
811
+ "loss": 0.9526,
812
+ "step": 203058
813
+ },
814
+ {
815
+ "epoch": 23.2,
816
+ "eval_loss": 0.9388800263404846,
817
+ "eval_runtime": 177.5696,
818
+ "eval_samples_per_second": 175.255,
819
+ "eval_steps_per_second": 10.953,
820
+ "step": 203058
821
+ },
822
+ {
823
+ "epoch": 23.6,
824
+ "learning_rate": 4.528026962184394e-05,
825
+ "loss": 0.9494,
826
+ "step": 206559
827
+ },
828
+ {
829
+ "epoch": 23.6,
830
+ "eval_loss": 0.93772292137146,
831
+ "eval_runtime": 177.5868,
832
+ "eval_samples_per_second": 175.238,
833
+ "eval_steps_per_second": 10.952,
834
+ "step": 206559
835
+ },
836
+ {
837
+ "epoch": 24.0,
838
+ "learning_rate": 4.5200274191705705e-05,
839
+ "loss": 0.953,
840
+ "step": 210060
841
+ },
842
+ {
843
+ "epoch": 24.0,
844
+ "eval_loss": 0.9386118054389954,
845
+ "eval_runtime": 177.6021,
846
+ "eval_samples_per_second": 175.223,
847
+ "eval_steps_per_second": 10.951,
848
+ "step": 210060
849
+ },
850
+ {
851
+ "epoch": 24.4,
852
+ "learning_rate": 4.512027876156746e-05,
853
+ "loss": 0.9472,
854
+ "step": 213561
855
+ },
856
+ {
857
+ "epoch": 24.4,
858
+ "eval_loss": 0.9392057061195374,
859
+ "eval_runtime": 177.5795,
860
+ "eval_samples_per_second": 175.245,
861
+ "eval_steps_per_second": 10.953,
862
+ "step": 213561
863
+ },
864
+ {
865
+ "epoch": 24.8,
866
+ "learning_rate": 4.504028333142923e-05,
867
+ "loss": 0.9505,
868
+ "step": 217062
869
+ },
870
+ {
871
+ "epoch": 24.8,
872
+ "eval_loss": 0.9383804798126221,
873
+ "eval_runtime": 177.5958,
874
+ "eval_samples_per_second": 175.229,
875
+ "eval_steps_per_second": 10.952,
876
+ "step": 217062
877
+ },
878
+ {
879
+ "epoch": 25.2,
880
+ "learning_rate": 4.4960287901290986e-05,
881
+ "loss": 0.9458,
882
+ "step": 220563
883
+ },
884
+ {
885
+ "epoch": 25.2,
886
+ "eval_loss": 0.9350699186325073,
887
+ "eval_runtime": 177.4438,
888
+ "eval_samples_per_second": 175.379,
889
+ "eval_steps_per_second": 10.961,
890
+ "step": 220563
891
+ },
892
+ {
893
+ "epoch": 25.6,
894
+ "learning_rate": 4.488029247115275e-05,
895
+ "loss": 0.9479,
896
+ "step": 224064
897
+ },
898
+ {
899
+ "epoch": 25.6,
900
+ "eval_loss": 0.9326021671295166,
901
+ "eval_runtime": 177.3212,
902
+ "eval_samples_per_second": 175.501,
903
+ "eval_steps_per_second": 10.969,
904
+ "step": 224064
905
+ },
906
+ {
907
+ "epoch": 26.0,
908
+ "learning_rate": 4.480029704101451e-05,
909
+ "loss": 0.9495,
910
+ "step": 227565
911
+ },
912
+ {
913
+ "epoch": 26.0,
914
+ "eval_loss": 0.9333738088607788,
915
+ "eval_runtime": 177.2781,
916
+ "eval_samples_per_second": 175.543,
917
+ "eval_steps_per_second": 10.971,
918
+ "step": 227565
919
+ },
920
+ {
921
+ "epoch": 26.4,
922
+ "learning_rate": 4.4720301610876274e-05,
923
+ "loss": 0.9434,
924
+ "step": 231066
925
+ },
926
+ {
927
+ "epoch": 26.4,
928
+ "eval_loss": 0.9343997836112976,
929
+ "eval_runtime": 177.4455,
930
+ "eval_samples_per_second": 175.378,
931
+ "eval_steps_per_second": 10.961,
932
+ "step": 231066
933
+ },
934
+ {
935
+ "epoch": 26.8,
936
+ "learning_rate": 4.464030618073804e-05,
937
+ "loss": 0.9475,
938
+ "step": 234567
939
+ },
940
+ {
941
+ "epoch": 26.8,
942
+ "eval_loss": 0.9394047856330872,
943
+ "eval_runtime": 177.4291,
944
+ "eval_samples_per_second": 175.394,
945
+ "eval_steps_per_second": 10.962,
946
+ "step": 234567
947
+ },
948
+ {
949
+ "epoch": 27.2,
950
+ "learning_rate": 4.45603107505998e-05,
951
+ "loss": 0.9479,
952
+ "step": 238068
953
+ },
954
+ {
955
+ "epoch": 27.2,
956
+ "eval_loss": 0.9336209893226624,
957
+ "eval_runtime": 177.3352,
958
+ "eval_samples_per_second": 175.487,
959
+ "eval_steps_per_second": 10.968,
960
+ "step": 238068
961
+ },
962
+ {
963
+ "epoch": 27.6,
964
+ "learning_rate": 4.4480315320461556e-05,
965
+ "loss": 0.9453,
966
+ "step": 241569
967
+ },
968
+ {
969
+ "epoch": 27.6,
970
+ "eval_loss": 0.9349226355552673,
971
+ "eval_runtime": 177.3043,
972
+ "eval_samples_per_second": 175.517,
973
+ "eval_steps_per_second": 10.97,
974
+ "step": 241569
975
+ },
976
+ {
977
+ "epoch": 28.0,
978
+ "learning_rate": 4.4400319890323325e-05,
979
+ "loss": 0.9458,
980
+ "step": 245070
981
+ },
982
+ {
983
+ "epoch": 28.0,
984
+ "eval_loss": 0.9337905645370483,
985
+ "eval_runtime": 177.3396,
986
+ "eval_samples_per_second": 175.482,
987
+ "eval_steps_per_second": 10.968,
988
+ "step": 245070
989
+ },
990
+ {
991
+ "epoch": 28.4,
992
+ "learning_rate": 4.432032446018508e-05,
993
+ "loss": 0.9411,
994
+ "step": 248571
995
+ },
996
+ {
997
+ "epoch": 28.4,
998
+ "eval_loss": 0.9311910271644592,
999
+ "eval_runtime": 177.2625,
1000
+ "eval_samples_per_second": 175.559,
1001
+ "eval_steps_per_second": 10.972,
1002
+ "step": 248571
1003
+ },
1004
+ {
1005
+ "epoch": 28.8,
1006
+ "learning_rate": 4.4240329030046844e-05,
1007
+ "loss": 0.9443,
1008
+ "step": 252072
1009
+ },
1010
+ {
1011
+ "epoch": 28.8,
1012
+ "eval_loss": 0.930526614189148,
1013
+ "eval_runtime": 177.3837,
1014
+ "eval_samples_per_second": 175.439,
1015
+ "eval_steps_per_second": 10.965,
1016
+ "step": 252072
1017
+ },
1018
+ {
1019
+ "epoch": 29.2,
1020
+ "learning_rate": 4.4160333599908606e-05,
1021
+ "loss": 0.9418,
1022
+ "step": 255573
1023
+ },
1024
+ {
1025
+ "epoch": 29.2,
1026
+ "eval_loss": 0.9349842667579651,
1027
+ "eval_runtime": 177.3409,
1028
+ "eval_samples_per_second": 175.481,
1029
+ "eval_steps_per_second": 10.968,
1030
+ "step": 255573
1031
+ },
1032
+ {
1033
+ "epoch": 29.6,
1034
+ "learning_rate": 4.408033816977037e-05,
1035
+ "loss": 0.9423,
1036
+ "step": 259074
1037
+ },
1038
+ {
1039
+ "epoch": 29.6,
1040
+ "eval_loss": 0.9312747716903687,
1041
+ "eval_runtime": 177.3094,
1042
+ "eval_samples_per_second": 175.512,
1043
+ "eval_steps_per_second": 10.97,
1044
+ "step": 259074
1045
+ },
1046
+ {
1047
+ "epoch": 30.0,
1048
+ "learning_rate": 4.4000342739632125e-05,
1049
+ "loss": 0.9441,
1050
+ "step": 262575
1051
+ },
1052
+ {
1053
+ "epoch": 30.0,
1054
+ "eval_loss": 0.9297969341278076,
1055
+ "eval_runtime": 177.4784,
1056
+ "eval_samples_per_second": 175.345,
1057
+ "eval_steps_per_second": 10.959,
1058
+ "step": 262575
1059
+ },
1060
+ {
1061
+ "epoch": 30.4,
1062
+ "learning_rate": 4.3920347309493895e-05,
1063
+ "loss": 0.9388,
1064
+ "step": 266076
1065
+ },
1066
+ {
1067
+ "epoch": 30.4,
1068
+ "eval_loss": 0.9298827052116394,
1069
+ "eval_runtime": 177.3405,
1070
+ "eval_samples_per_second": 175.482,
1071
+ "eval_steps_per_second": 10.968,
1072
+ "step": 266076
1073
+ },
1074
+ {
1075
+ "epoch": 30.8,
1076
+ "learning_rate": 4.384035187935565e-05,
1077
+ "loss": 0.9418,
1078
+ "step": 269577
1079
+ },
1080
+ {
1081
+ "epoch": 30.8,
1082
+ "eval_loss": 0.9313934445381165,
1083
+ "eval_runtime": 177.3424,
1084
+ "eval_samples_per_second": 175.48,
1085
+ "eval_steps_per_second": 10.967,
1086
+ "step": 269577
1087
+ },
1088
+ {
1089
+ "epoch": 31.2,
1090
+ "learning_rate": 4.376035644921741e-05,
1091
+ "loss": 0.9402,
1092
+ "step": 273078
1093
+ },
1094
+ {
1095
+ "epoch": 31.2,
1096
+ "eval_loss": 0.933687150478363,
1097
+ "eval_runtime": 177.2623,
1098
+ "eval_samples_per_second": 175.559,
1099
+ "eval_steps_per_second": 10.972,
1100
+ "step": 273078
1101
+ },
1102
+ {
1103
+ "epoch": 31.6,
1104
+ "learning_rate": 4.3680361019079176e-05,
1105
+ "loss": 0.9366,
1106
+ "step": 276579
1107
+ },
1108
+ {
1109
+ "epoch": 31.6,
1110
+ "eval_loss": 0.927956223487854,
1111
+ "eval_runtime": 177.2474,
1112
+ "eval_samples_per_second": 175.574,
1113
+ "eval_steps_per_second": 10.973,
1114
+ "step": 276579
1115
+ },
1116
+ {
1117
+ "epoch": 32.0,
1118
+ "learning_rate": 4.360036558894094e-05,
1119
+ "loss": 0.9406,
1120
+ "step": 280080
1121
+ },
1122
+ {
1123
+ "epoch": 32.0,
1124
+ "eval_loss": 0.9269111156463623,
1125
+ "eval_runtime": 177.2222,
1126
+ "eval_samples_per_second": 175.599,
1127
+ "eval_steps_per_second": 10.975,
1128
+ "step": 280080
1129
+ },
1130
+ {
1131
+ "epoch": 32.4,
1132
+ "learning_rate": 4.3520370158802695e-05,
1133
+ "loss": 0.9362,
1134
+ "step": 283581
1135
+ },
1136
+ {
1137
+ "epoch": 32.4,
1138
+ "eval_loss": 0.9323258399963379,
1139
+ "eval_runtime": 177.3606,
1140
+ "eval_samples_per_second": 175.462,
1141
+ "eval_steps_per_second": 10.966,
1142
+ "step": 283581
1143
+ },
1144
+ {
1145
+ "epoch": 32.8,
1146
+ "learning_rate": 4.3440374728664464e-05,
1147
+ "loss": 0.9389,
1148
+ "step": 287082
1149
+ },
1150
+ {
1151
+ "epoch": 32.8,
1152
+ "eval_loss": 0.9279223084449768,
1153
+ "eval_runtime": 177.4942,
1154
+ "eval_samples_per_second": 175.33,
1155
+ "eval_steps_per_second": 10.958,
1156
+ "step": 287082
1157
+ },
1158
+ {
1159
+ "epoch": 33.2,
1160
+ "learning_rate": 4.336037929852622e-05,
1161
+ "loss": 0.9347,
1162
+ "step": 290583
1163
+ },
1164
+ {
1165
+ "epoch": 33.2,
1166
+ "eval_loss": 0.9276158213615417,
1167
+ "eval_runtime": 177.5222,
1168
+ "eval_samples_per_second": 175.302,
1169
+ "eval_steps_per_second": 10.956,
1170
+ "step": 290583
1171
+ },
1172
+ {
1173
+ "epoch": 33.6,
1174
+ "learning_rate": 4.328038386838798e-05,
1175
+ "loss": 0.9361,
1176
+ "step": 294084
1177
+ },
1178
+ {
1179
+ "epoch": 33.6,
1180
+ "eval_loss": 0.93123459815979,
1181
+ "eval_runtime": 177.5834,
1182
+ "eval_samples_per_second": 175.242,
1183
+ "eval_steps_per_second": 10.953,
1184
+ "step": 294084
1185
+ },
1186
+ {
1187
+ "epoch": 34.0,
1188
+ "learning_rate": 4.3200388438249746e-05,
1189
+ "loss": 0.9382,
1190
+ "step": 297585
1191
+ },
1192
+ {
1193
+ "epoch": 34.0,
1194
+ "eval_loss": 0.928411602973938,
1195
+ "eval_runtime": 177.6079,
1196
+ "eval_samples_per_second": 175.217,
1197
+ "eval_steps_per_second": 10.951,
1198
+ "step": 297585
1199
+ },
1200
+ {
1201
+ "epoch": 34.4,
1202
+ "learning_rate": 4.312039300811151e-05,
1203
+ "loss": 0.9319,
1204
+ "step": 301086
1205
+ },
1206
+ {
1207
+ "epoch": 34.4,
1208
+ "eval_loss": 0.9270405173301697,
1209
+ "eval_runtime": 177.5221,
1210
+ "eval_samples_per_second": 175.302,
1211
+ "eval_steps_per_second": 10.956,
1212
+ "step": 301086
1213
+ },
1214
+ {
1215
+ "epoch": 34.8,
1216
+ "learning_rate": 4.304039757797327e-05,
1217
+ "loss": 0.9361,
1218
+ "step": 304587
1219
+ },
1220
+ {
1221
+ "epoch": 34.8,
1222
+ "eval_loss": 0.9271640777587891,
1223
+ "eval_runtime": 177.3566,
1224
+ "eval_samples_per_second": 175.466,
1225
+ "eval_steps_per_second": 10.967,
1226
+ "step": 304587
1227
+ },
1228
+ {
1229
+ "epoch": 35.2,
1230
+ "learning_rate": 4.2960402147835034e-05,
1231
+ "loss": 0.9339,
1232
+ "step": 308088
1233
+ },
1234
+ {
1235
+ "epoch": 35.2,
1236
+ "eval_loss": 0.9249849319458008,
1237
+ "eval_runtime": 177.2959,
1238
+ "eval_samples_per_second": 175.526,
1239
+ "eval_steps_per_second": 10.97,
1240
+ "step": 308088
1241
+ },
1242
+ {
1243
+ "epoch": 35.6,
1244
+ "learning_rate": 4.288040671769679e-05,
1245
+ "loss": 0.9329,
1246
+ "step": 311589
1247
+ },
1248
+ {
1249
+ "epoch": 35.6,
1250
+ "eval_loss": 0.9226134419441223,
1251
+ "eval_runtime": 177.4932,
1252
+ "eval_samples_per_second": 175.331,
1253
+ "eval_steps_per_second": 10.958,
1254
+ "step": 311589
1255
+ },
1256
+ {
1257
+ "epoch": 36.0,
1258
+ "learning_rate": 4.280041128755856e-05,
1259
+ "loss": 0.9364,
1260
+ "step": 315090
1261
+ },
1262
+ {
1263
+ "epoch": 36.0,
1264
+ "eval_loss": 0.9226271510124207,
1265
+ "eval_runtime": 177.4871,
1266
+ "eval_samples_per_second": 175.337,
1267
+ "eval_steps_per_second": 10.959,
1268
+ "step": 315090
1269
+ },
1270
+ {
1271
+ "epoch": 36.4,
1272
+ "learning_rate": 4.2720415857420315e-05,
1273
+ "loss": 0.9307,
1274
+ "step": 318591
1275
+ },
1276
+ {
1277
+ "epoch": 36.4,
1278
+ "eval_loss": 0.9248347878456116,
1279
+ "eval_runtime": 177.8038,
1280
+ "eval_samples_per_second": 175.024,
1281
+ "eval_steps_per_second": 10.939,
1282
+ "step": 318591
1283
+ },
1284
+ {
1285
+ "epoch": 36.8,
1286
+ "learning_rate": 4.264042042728208e-05,
1287
+ "loss": 0.9326,
1288
+ "step": 322092
1289
+ },
1290
+ {
1291
+ "epoch": 36.8,
1292
+ "eval_loss": 0.9248685836791992,
1293
+ "eval_runtime": 177.5153,
1294
+ "eval_samples_per_second": 175.309,
1295
+ "eval_steps_per_second": 10.957,
1296
+ "step": 322092
1297
+ },
1298
+ {
1299
+ "epoch": 37.2,
1300
+ "learning_rate": 4.256042499714384e-05,
1301
+ "loss": 0.932,
1302
+ "step": 325593
1303
+ },
1304
+ {
1305
+ "epoch": 37.2,
1306
+ "eval_loss": 0.9229134321212769,
1307
+ "eval_runtime": 177.4978,
1308
+ "eval_samples_per_second": 175.326,
1309
+ "eval_steps_per_second": 10.958,
1310
+ "step": 325593
1311
+ },
1312
+ {
1313
+ "epoch": 37.6,
1314
+ "learning_rate": 4.2480429567005596e-05,
1315
+ "loss": 0.9321,
1316
+ "step": 329094
1317
+ },
1318
+ {
1319
+ "epoch": 37.6,
1320
+ "eval_loss": 0.9236814975738525,
1321
+ "eval_runtime": 178.5737,
1322
+ "eval_samples_per_second": 174.27,
1323
+ "eval_steps_per_second": 10.892,
1324
+ "step": 329094
1325
+ },
1326
+ {
1327
+ "epoch": 38.0,
1328
+ "learning_rate": 4.240043413686736e-05,
1329
+ "loss": 0.9315,
1330
+ "step": 332595
1331
+ },
1332
+ {
1333
+ "epoch": 38.0,
1334
+ "eval_loss": 0.9243875741958618,
1335
+ "eval_runtime": 178.4244,
1336
+ "eval_samples_per_second": 174.416,
1337
+ "eval_steps_per_second": 10.901,
1338
+ "step": 332595
1339
+ },
1340
+ {
1341
+ "epoch": 38.4,
1342
+ "learning_rate": 4.232043870672912e-05,
1343
+ "loss": 0.9277,
1344
+ "step": 336096
1345
+ },
1346
+ {
1347
+ "epoch": 38.4,
1348
+ "eval_loss": 0.921384334564209,
1349
+ "eval_runtime": 178.5199,
1350
+ "eval_samples_per_second": 174.322,
1351
+ "eval_steps_per_second": 10.895,
1352
+ "step": 336096
1353
+ },
1354
+ {
1355
+ "epoch": 38.8,
1356
+ "learning_rate": 4.2240443276590885e-05,
1357
+ "loss": 0.9305,
1358
+ "step": 339597
1359
+ },
1360
+ {
1361
+ "epoch": 38.8,
1362
+ "eval_loss": 0.924416184425354,
1363
+ "eval_runtime": 178.4499,
1364
+ "eval_samples_per_second": 174.391,
1365
+ "eval_steps_per_second": 10.899,
1366
+ "step": 339597
1367
+ },
1368
+ {
1369
+ "epoch": 39.2,
1370
+ "learning_rate": 4.216044784645265e-05,
1371
+ "loss": 0.9264,
1372
+ "step": 343098
1373
+ },
1374
+ {
1375
+ "epoch": 39.2,
1376
+ "eval_loss": 0.9202448725700378,
1377
+ "eval_runtime": 178.5894,
1378
+ "eval_samples_per_second": 174.254,
1379
+ "eval_steps_per_second": 10.891,
1380
+ "step": 343098
1381
+ },
1382
+ {
1383
+ "epoch": 39.6,
1384
+ "learning_rate": 4.208045241631441e-05,
1385
+ "loss": 0.9272,
1386
+ "step": 346599
1387
+ },
1388
+ {
1389
+ "epoch": 39.6,
1390
+ "eval_loss": 0.9173216223716736,
1391
+ "eval_runtime": 178.587,
1392
+ "eval_samples_per_second": 174.257,
1393
+ "eval_steps_per_second": 10.891,
1394
+ "step": 346599
1395
+ },
1396
+ {
1397
+ "epoch": 40.0,
1398
+ "learning_rate": 4.2000456986176166e-05,
1399
+ "loss": 0.9292,
1400
+ "step": 350100
1401
+ },
1402
+ {
1403
+ "epoch": 40.0,
1404
+ "eval_loss": 0.9211925268173218,
1405
+ "eval_runtime": 177.5554,
1406
+ "eval_samples_per_second": 175.269,
1407
+ "eval_steps_per_second": 10.954,
1408
+ "step": 350100
1409
+ },
1410
+ {
1411
+ "epoch": 40.4,
1412
+ "learning_rate": 4.1920461556037935e-05,
1413
+ "loss": 0.9237,
1414
+ "step": 353601
1415
+ },
1416
+ {
1417
+ "epoch": 40.4,
1418
+ "eval_loss": 0.9185072183609009,
1419
+ "eval_runtime": 177.3103,
1420
+ "eval_samples_per_second": 175.512,
1421
+ "eval_steps_per_second": 10.969,
1422
+ "step": 353601
1423
+ },
1424
+ {
1425
+ "epoch": 40.8,
1426
+ "learning_rate": 4.184046612589969e-05,
1427
+ "loss": 0.9277,
1428
+ "step": 357102
1429
+ },
1430
+ {
1431
+ "epoch": 40.8,
1432
+ "eval_loss": 0.9214913845062256,
1433
+ "eval_runtime": 177.444,
1434
+ "eval_samples_per_second": 175.379,
1435
+ "eval_steps_per_second": 10.961,
1436
+ "step": 357102
1437
+ },
1438
+ {
1439
+ "epoch": 41.2,
1440
+ "learning_rate": 4.1760470695761454e-05,
1441
+ "loss": 0.9262,
1442
+ "step": 360603
1443
+ },
1444
+ {
1445
+ "epoch": 41.2,
1446
+ "eval_loss": 0.9183996319770813,
1447
+ "eval_runtime": 177.4174,
1448
+ "eval_samples_per_second": 175.406,
1449
+ "eval_steps_per_second": 10.963,
1450
+ "step": 360603
1451
+ },
1452
+ {
1453
+ "epoch": 41.6,
1454
+ "learning_rate": 4.168047526562322e-05,
1455
+ "loss": 0.9247,
1456
+ "step": 364104
1457
+ },
1458
+ {
1459
+ "epoch": 41.6,
1460
+ "eval_loss": 0.921753466129303,
1461
+ "eval_runtime": 177.4686,
1462
+ "eval_samples_per_second": 175.355,
1463
+ "eval_steps_per_second": 10.96,
1464
+ "step": 364104
1465
+ },
1466
+ {
1467
+ "epoch": 42.0,
1468
+ "learning_rate": 4.160047983548498e-05,
1469
+ "loss": 0.9265,
1470
+ "step": 367605
1471
+ },
1472
+ {
1473
+ "epoch": 42.0,
1474
+ "eval_loss": 0.9194549918174744,
1475
+ "eval_runtime": 177.4828,
1476
+ "eval_samples_per_second": 175.341,
1477
+ "eval_steps_per_second": 10.959,
1478
+ "step": 367605
1479
+ },
1480
+ {
1481
+ "epoch": 42.4,
1482
+ "learning_rate": 4.1520484405346736e-05,
1483
+ "loss": 0.9228,
1484
+ "step": 371106
1485
+ },
1486
+ {
1487
+ "epoch": 42.4,
1488
+ "eval_loss": 0.9182046055793762,
1489
+ "eval_runtime": 177.5745,
1490
+ "eval_samples_per_second": 175.25,
1491
+ "eval_steps_per_second": 10.953,
1492
+ "step": 371106
1493
+ },
1494
+ {
1495
+ "epoch": 42.8,
1496
+ "learning_rate": 4.1440488975208505e-05,
1497
+ "loss": 0.9239,
1498
+ "step": 374607
1499
+ },
1500
+ {
1501
+ "epoch": 42.8,
1502
+ "eval_loss": 0.9209604859352112,
1503
+ "eval_runtime": 177.5469,
1504
+ "eval_samples_per_second": 175.278,
1505
+ "eval_steps_per_second": 10.955,
1506
+ "step": 374607
1507
+ },
1508
+ {
1509
+ "epoch": 43.2,
1510
+ "learning_rate": 4.136049354507026e-05,
1511
+ "loss": 0.9234,
1512
+ "step": 378108
1513
+ },
1514
+ {
1515
+ "epoch": 43.2,
1516
+ "eval_loss": 0.9235459566116333,
1517
+ "eval_runtime": 177.4486,
1518
+ "eval_samples_per_second": 175.375,
1519
+ "eval_steps_per_second": 10.961,
1520
+ "step": 378108
1521
+ },
1522
+ {
1523
+ "epoch": 43.6,
1524
+ "learning_rate": 4.1280498114932024e-05,
1525
+ "loss": 0.9235,
1526
+ "step": 381609
1527
+ },
1528
+ {
1529
+ "epoch": 43.6,
1530
+ "eval_loss": 0.9178280234336853,
1531
+ "eval_runtime": 177.5134,
1532
+ "eval_samples_per_second": 175.311,
1533
+ "eval_steps_per_second": 10.957,
1534
+ "step": 381609
1535
+ },
1536
+ {
1537
+ "epoch": 44.0,
1538
+ "learning_rate": 4.1200502684793786e-05,
1539
+ "loss": 0.9251,
1540
+ "step": 385110
1541
+ },
1542
+ {
1543
+ "epoch": 44.0,
1544
+ "eval_loss": 0.913722574710846,
1545
+ "eval_runtime": 177.523,
1546
+ "eval_samples_per_second": 175.301,
1547
+ "eval_steps_per_second": 10.956,
1548
+ "step": 385110
1549
+ },
1550
+ {
1551
+ "epoch": 44.4,
1552
+ "learning_rate": 4.112050725465555e-05,
1553
+ "loss": 0.9188,
1554
+ "step": 388611
1555
+ },
1556
+ {
1557
+ "epoch": 44.4,
1558
+ "eval_loss": 0.9187389612197876,
1559
+ "eval_runtime": 177.5478,
1560
+ "eval_samples_per_second": 175.277,
1561
+ "eval_steps_per_second": 10.955,
1562
+ "step": 388611
1563
+ },
1564
+ {
1565
+ "epoch": 44.8,
1566
+ "learning_rate": 4.1040511824517305e-05,
1567
+ "loss": 0.9215,
1568
+ "step": 392112
1569
+ },
1570
+ {
1571
+ "epoch": 44.8,
1572
+ "eval_loss": 0.9166533946990967,
1573
+ "eval_runtime": 177.4821,
1574
+ "eval_samples_per_second": 175.342,
1575
+ "eval_steps_per_second": 10.959,
1576
+ "step": 392112
1577
+ },
1578
+ {
1579
+ "epoch": 45.2,
1580
+ "learning_rate": 4.0960516394379075e-05,
1581
+ "loss": 0.9203,
1582
+ "step": 395613
1583
+ },
1584
+ {
1585
+ "epoch": 45.2,
1586
+ "eval_loss": 0.9170865416526794,
1587
+ "eval_runtime": 177.4835,
1588
+ "eval_samples_per_second": 175.34,
1589
+ "eval_steps_per_second": 10.959,
1590
+ "step": 395613
1591
+ },
1592
+ {
1593
+ "epoch": 45.6,
1594
+ "learning_rate": 4.088052096424083e-05,
1595
+ "loss": 0.9206,
1596
+ "step": 399114
1597
+ },
1598
+ {
1599
+ "epoch": 45.6,
1600
+ "eval_loss": 0.9114164113998413,
1601
+ "eval_runtime": 177.3931,
1602
+ "eval_samples_per_second": 175.43,
1603
+ "eval_steps_per_second": 10.964,
1604
+ "step": 399114
1605
+ },
1606
+ {
1607
+ "epoch": 46.0,
1608
+ "learning_rate": 4.080052553410259e-05,
1609
+ "loss": 0.9226,
1610
+ "step": 402615
1611
+ },
1612
+ {
1613
+ "epoch": 46.0,
1614
+ "eval_loss": 0.9157847762107849,
1615
+ "eval_runtime": 177.4174,
1616
+ "eval_samples_per_second": 175.406,
1617
+ "eval_steps_per_second": 10.963,
1618
+ "step": 402615
1619
+ },
1620
+ {
1621
+ "epoch": 46.4,
1622
+ "learning_rate": 4.0720530103964356e-05,
1623
+ "loss": 0.9172,
1624
+ "step": 406116
1625
+ },
1626
+ {
1627
+ "epoch": 46.4,
1628
+ "eval_loss": 0.9159908294677734,
1629
+ "eval_runtime": 177.5134,
1630
+ "eval_samples_per_second": 175.311,
1631
+ "eval_steps_per_second": 10.957,
1632
+ "step": 406116
1633
+ },
1634
+ {
1635
+ "epoch": 46.8,
1636
+ "learning_rate": 4.064053467382612e-05,
1637
+ "loss": 0.918,
1638
+ "step": 409617
1639
+ },
1640
+ {
1641
+ "epoch": 46.8,
1642
+ "eval_loss": 0.9126763343811035,
1643
+ "eval_runtime": 177.4115,
1644
+ "eval_samples_per_second": 175.411,
1645
+ "eval_steps_per_second": 10.963,
1646
+ "step": 409617
1647
+ },
1648
+ {
1649
+ "epoch": 47.2,
1650
+ "learning_rate": 4.056053924368788e-05,
1651
+ "loss": 0.9177,
1652
+ "step": 413118
1653
+ },
1654
+ {
1655
+ "epoch": 47.2,
1656
+ "eval_loss": 0.9146538376808167,
1657
+ "eval_runtime": 177.5159,
1658
+ "eval_samples_per_second": 175.308,
1659
+ "eval_steps_per_second": 10.957,
1660
+ "step": 413118
1661
+ },
1662
+ {
1663
+ "epoch": 47.6,
1664
+ "learning_rate": 4.0480543813549644e-05,
1665
+ "loss": 0.918,
1666
+ "step": 416619
1667
+ },
1668
+ {
1669
+ "epoch": 47.6,
1670
+ "eval_loss": 0.9144261479377747,
1671
+ "eval_runtime": 177.4299,
1672
+ "eval_samples_per_second": 175.393,
1673
+ "eval_steps_per_second": 10.962,
1674
+ "step": 416619
1675
+ },
1676
+ {
1677
+ "epoch": 48.0,
1678
+ "learning_rate": 4.04005483834114e-05,
1679
+ "loss": 0.9186,
1680
+ "step": 420120
1681
+ },
1682
+ {
1683
+ "epoch": 48.0,
1684
+ "eval_loss": 0.9152446389198303,
1685
+ "eval_runtime": 177.4928,
1686
+ "eval_samples_per_second": 175.331,
1687
+ "eval_steps_per_second": 10.958,
1688
+ "step": 420120
1689
+ },
1690
+ {
1691
+ "epoch": 48.4,
1692
+ "learning_rate": 4.032055295327317e-05,
1693
+ "loss": 0.9145,
1694
+ "step": 423621
1695
+ },
1696
+ {
1697
+ "epoch": 48.4,
1698
+ "eval_loss": 0.9146500825881958,
1699
+ "eval_runtime": 177.6162,
1700
+ "eval_samples_per_second": 175.209,
1701
+ "eval_steps_per_second": 10.951,
1702
+ "step": 423621
1703
+ },
1704
+ {
1705
+ "epoch": 48.8,
1706
+ "learning_rate": 4.0240557523134926e-05,
1707
+ "loss": 0.9176,
1708
+ "step": 427122
1709
+ },
1710
+ {
1711
+ "epoch": 48.8,
1712
+ "eval_loss": 0.91416335105896,
1713
+ "eval_runtime": 177.5943,
1714
+ "eval_samples_per_second": 175.231,
1715
+ "eval_steps_per_second": 10.952,
1716
+ "step": 427122
1717
+ },
1718
+ {
1719
+ "epoch": 49.2,
1720
+ "learning_rate": 4.016056209299669e-05,
1721
+ "loss": 0.9146,
1722
+ "step": 430623
1723
+ },
1724
+ {
1725
+ "epoch": 49.2,
1726
+ "eval_loss": 0.9143691658973694,
1727
+ "eval_runtime": 177.6008,
1728
+ "eval_samples_per_second": 175.224,
1729
+ "eval_steps_per_second": 10.952,
1730
+ "step": 430623
1731
+ },
1732
+ {
1733
+ "epoch": 49.6,
1734
+ "learning_rate": 4.008056666285845e-05,
1735
+ "loss": 0.9163,
1736
+ "step": 434124
1737
+ },
1738
+ {
1739
+ "epoch": 49.6,
1740
+ "eval_loss": 0.9136433601379395,
1741
+ "eval_runtime": 177.4417,
1742
+ "eval_samples_per_second": 175.382,
1743
+ "eval_steps_per_second": 10.961,
1744
+ "step": 434124
1745
+ },
1746
+ {
1747
+ "epoch": 50.0,
1748
+ "learning_rate": 4.0000571232720214e-05,
1749
+ "loss": 0.9164,
1750
+ "step": 437625
1751
+ },
1752
+ {
1753
+ "epoch": 50.0,
1754
+ "eval_loss": 0.9115509390830994,
1755
+ "eval_runtime": 178.4645,
1756
+ "eval_samples_per_second": 174.376,
1757
+ "eval_steps_per_second": 10.899,
1758
+ "step": 437625
1759
+ },
1760
+ {
1761
+ "epoch": 50.4,
1762
+ "learning_rate": 3.992057580258197e-05,
1763
+ "loss": 0.9102,
1764
+ "step": 441126
1765
+ },
1766
+ {
1767
+ "epoch": 50.4,
1768
+ "eval_loss": 0.9133750796318054,
1769
+ "eval_runtime": 178.2605,
1770
+ "eval_samples_per_second": 174.576,
1771
+ "eval_steps_per_second": 10.911,
1772
+ "step": 441126
1773
+ },
1774
+ {
1775
+ "epoch": 50.8,
1776
+ "learning_rate": 3.984058037244374e-05,
1777
+ "loss": 0.9149,
1778
+ "step": 444627
1779
+ },
1780
+ {
1781
+ "epoch": 50.8,
1782
+ "eval_loss": 0.9099620580673218,
1783
+ "eval_runtime": 177.7712,
1784
+ "eval_samples_per_second": 175.056,
1785
+ "eval_steps_per_second": 10.941,
1786
+ "step": 444627
1787
+ },
1788
+ {
1789
+ "epoch": 51.2,
1790
+ "learning_rate": 3.9760584942305495e-05,
1791
+ "loss": 0.9155,
1792
+ "step": 448128
1793
+ },
1794
+ {
1795
+ "epoch": 51.2,
1796
+ "eval_loss": 0.9076240658760071,
1797
+ "eval_runtime": 177.4509,
1798
+ "eval_samples_per_second": 175.372,
1799
+ "eval_steps_per_second": 10.961,
1800
+ "step": 448128
1801
+ },
1802
+ {
1803
+ "epoch": 51.6,
1804
+ "learning_rate": 3.968058951216726e-05,
1805
+ "loss": 0.9128,
1806
+ "step": 451629
1807
+ },
1808
+ {
1809
+ "epoch": 51.6,
1810
+ "eval_loss": 0.9097868204116821,
1811
+ "eval_runtime": 177.4759,
1812
+ "eval_samples_per_second": 175.348,
1813
+ "eval_steps_per_second": 10.959,
1814
+ "step": 451629
1815
+ },
1816
+ {
1817
+ "epoch": 52.0,
1818
+ "learning_rate": 3.960059408202902e-05,
1819
+ "loss": 0.9128,
1820
+ "step": 455130
1821
+ },
1822
+ {
1823
+ "epoch": 52.0,
1824
+ "eval_loss": 0.9142981171607971,
1825
+ "eval_runtime": 177.8314,
1826
+ "eval_samples_per_second": 174.997,
1827
+ "eval_steps_per_second": 10.937,
1828
+ "step": 455130
1829
+ },
1830
+ {
1831
+ "epoch": 52.4,
1832
+ "learning_rate": 3.952059865189078e-05,
1833
+ "loss": 0.9098,
1834
+ "step": 458631
1835
+ },
1836
+ {
1837
+ "epoch": 52.4,
1838
+ "eval_loss": 0.9127160310745239,
1839
+ "eval_runtime": 177.596,
1840
+ "eval_samples_per_second": 175.229,
1841
+ "eval_steps_per_second": 10.952,
1842
+ "step": 458631
1843
+ },
1844
+ {
1845
+ "epoch": 52.8,
1846
+ "learning_rate": 3.944060322175254e-05,
1847
+ "loss": 0.9121,
1848
+ "step": 462132
1849
+ },
1850
+ {
1851
+ "epoch": 52.8,
1852
+ "eval_loss": 0.911176323890686,
1853
+ "eval_runtime": 177.6183,
1854
+ "eval_samples_per_second": 175.207,
1855
+ "eval_steps_per_second": 10.95,
1856
+ "step": 462132
1857
+ },
1858
+ {
1859
+ "epoch": 53.2,
1860
+ "learning_rate": 3.936060779161431e-05,
1861
+ "loss": 0.9128,
1862
+ "step": 465633
1863
+ },
1864
+ {
1865
+ "epoch": 53.2,
1866
+ "eval_loss": 0.9088250398635864,
1867
+ "eval_runtime": 177.3192,
1868
+ "eval_samples_per_second": 175.503,
1869
+ "eval_steps_per_second": 10.969,
1870
+ "step": 465633
1871
+ },
1872
+ {
1873
+ "epoch": 53.6,
1874
+ "learning_rate": 3.9280612361476065e-05,
1875
+ "loss": 0.9068,
1876
+ "step": 469134
1877
+ },
1878
+ {
1879
+ "epoch": 53.6,
1880
+ "eval_loss": 0.9073150157928467,
1881
+ "eval_runtime": 177.6629,
1882
+ "eval_samples_per_second": 175.163,
1883
+ "eval_steps_per_second": 10.948,
1884
+ "step": 469134
1885
+ },
1886
+ {
1887
+ "epoch": 54.0,
1888
+ "learning_rate": 3.920061693133783e-05,
1889
+ "loss": 0.9116,
1890
+ "step": 472635
1891
+ },
1892
+ {
1893
+ "epoch": 54.0,
1894
+ "eval_loss": 0.9075030088424683,
1895
+ "eval_runtime": 177.5925,
1896
+ "eval_samples_per_second": 175.233,
1897
+ "eval_steps_per_second": 10.952,
1898
+ "step": 472635
1899
+ },
1900
+ {
1901
+ "epoch": 54.4,
1902
+ "learning_rate": 3.912062150119959e-05,
1903
+ "loss": 0.9071,
1904
+ "step": 476136
1905
+ },
1906
+ {
1907
+ "epoch": 54.4,
1908
+ "eval_loss": 0.9067005515098572,
1909
+ "eval_runtime": 177.5266,
1910
+ "eval_samples_per_second": 175.298,
1911
+ "eval_steps_per_second": 10.956,
1912
+ "step": 476136
1913
+ },
1914
+ {
1915
+ "epoch": 54.8,
1916
+ "learning_rate": 3.904062607106135e-05,
1917
+ "loss": 0.9104,
1918
+ "step": 479637
1919
+ },
1920
+ {
1921
+ "epoch": 54.8,
1922
+ "eval_loss": 0.9073493480682373,
1923
+ "eval_runtime": 177.4472,
1924
+ "eval_samples_per_second": 175.376,
1925
+ "eval_steps_per_second": 10.961,
1926
+ "step": 479637
1927
+ },
1928
+ {
1929
+ "epoch": 55.2,
1930
+ "learning_rate": 3.8960630640923115e-05,
1931
+ "loss": 0.9054,
1932
+ "step": 483138
1933
+ },
1934
+ {
1935
+ "epoch": 55.2,
1936
+ "eval_loss": 0.9102022051811218,
1937
+ "eval_runtime": 177.6725,
1938
+ "eval_samples_per_second": 175.154,
1939
+ "eval_steps_per_second": 10.947,
1940
+ "step": 483138
1941
+ },
1942
+ {
1943
+ "epoch": 55.6,
1944
+ "learning_rate": 3.888063521078488e-05,
1945
+ "loss": 0.9097,
1946
+ "step": 486639
1947
+ },
1948
+ {
1949
+ "epoch": 55.6,
1950
+ "eval_loss": 0.9054126143455505,
1951
+ "eval_runtime": 177.6461,
1952
+ "eval_samples_per_second": 175.18,
1953
+ "eval_steps_per_second": 10.949,
1954
+ "step": 486639
1955
+ },
1956
+ {
1957
+ "epoch": 56.0,
1958
+ "learning_rate": 3.8800639780646634e-05,
1959
+ "loss": 0.9091,
1960
+ "step": 490140
1961
+ },
1962
+ {
1963
+ "epoch": 56.0,
1964
+ "eval_loss": 0.9067865014076233,
1965
+ "eval_runtime": 177.5605,
1966
+ "eval_samples_per_second": 175.264,
1967
+ "eval_steps_per_second": 10.954,
1968
+ "step": 490140
1969
+ },
1970
+ {
1971
+ "epoch": 56.4,
1972
+ "learning_rate": 3.8720644350508404e-05,
1973
+ "loss": 0.9047,
1974
+ "step": 493641
1975
+ },
1976
+ {
1977
+ "epoch": 56.4,
1978
+ "eval_loss": 0.9057883024215698,
1979
+ "eval_runtime": 177.3995,
1980
+ "eval_samples_per_second": 175.423,
1981
+ "eval_steps_per_second": 10.964,
1982
+ "step": 493641
1983
+ },
1984
+ {
1985
+ "epoch": 56.8,
1986
+ "learning_rate": 3.864064892037016e-05,
1987
+ "loss": 0.9076,
1988
+ "step": 497142
1989
+ },
1990
+ {
1991
+ "epoch": 56.8,
1992
+ "eval_loss": 0.9096718430519104,
1993
+ "eval_runtime": 177.7746,
1994
+ "eval_samples_per_second": 175.053,
1995
+ "eval_steps_per_second": 10.941,
1996
+ "step": 497142
1997
+ },
1998
+ {
1999
+ "epoch": 57.2,
2000
+ "learning_rate": 3.856065349023192e-05,
2001
+ "loss": 0.9052,
2002
+ "step": 500643
2003
+ },
2004
+ {
2005
+ "epoch": 57.2,
2006
+ "eval_loss": 0.9001969695091248,
2007
+ "eval_runtime": 177.6819,
2008
+ "eval_samples_per_second": 175.144,
2009
+ "eval_steps_per_second": 10.947,
2010
+ "step": 500643
2011
+ },
2012
+ {
2013
+ "epoch": 57.6,
2014
+ "learning_rate": 3.8480658060093685e-05,
2015
+ "loss": 0.9047,
2016
+ "step": 504144
2017
+ },
2018
+ {
2019
+ "epoch": 57.6,
2020
+ "eval_loss": 0.9067962765693665,
2021
+ "eval_runtime": 177.5691,
2022
+ "eval_samples_per_second": 175.256,
2023
+ "eval_steps_per_second": 10.953,
2024
+ "step": 504144
2025
+ },
2026
+ {
2027
+ "epoch": 58.0,
2028
+ "learning_rate": 3.840066262995545e-05,
2029
+ "loss": 0.9061,
2030
+ "step": 507645
2031
+ },
2032
+ {
2033
+ "epoch": 58.0,
2034
+ "eval_loss": 0.9069446921348572,
2035
+ "eval_runtime": 177.6225,
2036
+ "eval_samples_per_second": 175.203,
2037
+ "eval_steps_per_second": 10.95,
2038
+ "step": 507645
2039
+ },
2040
+ {
2041
+ "epoch": 58.4,
2042
+ "learning_rate": 3.8320667199817204e-05,
2043
+ "loss": 0.9004,
2044
+ "step": 511146
2045
+ },
2046
+ {
2047
+ "epoch": 58.4,
2048
+ "eval_loss": 0.9056394696235657,
2049
+ "eval_runtime": 177.6384,
2050
+ "eval_samples_per_second": 175.187,
2051
+ "eval_steps_per_second": 10.949,
2052
+ "step": 511146
2053
+ },
2054
+ {
2055
+ "epoch": 58.8,
2056
+ "learning_rate": 3.824067176967897e-05,
2057
+ "loss": 0.9056,
2058
+ "step": 514647
2059
+ },
2060
+ {
2061
+ "epoch": 58.8,
2062
+ "eval_loss": 0.9041665196418762,
2063
+ "eval_runtime": 177.5114,
2064
+ "eval_samples_per_second": 175.313,
2065
+ "eval_steps_per_second": 10.957,
2066
+ "step": 514647
2067
+ },
2068
+ {
2069
+ "epoch": 59.2,
2070
+ "learning_rate": 3.816067633954073e-05,
2071
+ "loss": 0.9038,
2072
+ "step": 518148
2073
+ },
2074
+ {
2075
+ "epoch": 59.2,
2076
+ "eval_loss": 0.9006583094596863,
2077
+ "eval_runtime": 177.5976,
2078
+ "eval_samples_per_second": 175.228,
2079
+ "eval_steps_per_second": 10.952,
2080
+ "step": 518148
2081
+ },
2082
+ {
2083
+ "epoch": 59.6,
2084
+ "learning_rate": 3.808068090940249e-05,
2085
+ "loss": 0.9033,
2086
+ "step": 521649
2087
+ },
2088
+ {
2089
+ "epoch": 59.6,
2090
+ "eval_loss": 0.905081570148468,
2091
+ "eval_runtime": 177.5456,
2092
+ "eval_samples_per_second": 175.279,
2093
+ "eval_steps_per_second": 10.955,
2094
+ "step": 521649
2095
+ },
2096
+ {
2097
+ "epoch": 60.0,
2098
+ "learning_rate": 3.8000685479264255e-05,
2099
+ "loss": 0.903,
2100
+ "step": 525150
2101
+ },
2102
+ {
2103
+ "epoch": 60.0,
2104
+ "eval_loss": 0.9015125632286072,
2105
+ "eval_runtime": 177.5682,
2106
+ "eval_samples_per_second": 175.257,
2107
+ "eval_steps_per_second": 10.954,
2108
+ "step": 525150
2109
+ },
2110
+ {
2111
+ "epoch": 60.4,
2112
+ "learning_rate": 3.792069004912602e-05,
2113
+ "loss": 0.9001,
2114
+ "step": 528651
2115
+ },
2116
+ {
2117
+ "epoch": 60.4,
2118
+ "eval_loss": 0.9008970856666565,
2119
+ "eval_runtime": 177.5673,
2120
+ "eval_samples_per_second": 175.258,
2121
+ "eval_steps_per_second": 10.954,
2122
+ "step": 528651
2123
+ },
2124
+ {
2125
+ "epoch": 60.8,
2126
+ "learning_rate": 3.784069461898777e-05,
2127
+ "loss": 0.9039,
2128
+ "step": 532152
2129
+ },
2130
+ {
2131
+ "epoch": 60.8,
2132
+ "eval_loss": 0.8996139764785767,
2133
+ "eval_runtime": 177.5892,
2134
+ "eval_samples_per_second": 175.236,
2135
+ "eval_steps_per_second": 10.952,
2136
+ "step": 532152
2137
+ },
2138
+ {
2139
+ "epoch": 61.2,
2140
+ "learning_rate": 3.776069918884954e-05,
2141
+ "loss": 0.9026,
2142
+ "step": 535653
2143
+ },
2144
+ {
2145
+ "epoch": 61.2,
2146
+ "eval_loss": 0.9031299948692322,
2147
+ "eval_runtime": 177.6018,
2148
+ "eval_samples_per_second": 175.223,
2149
+ "eval_steps_per_second": 10.951,
2150
+ "step": 535653
2151
+ },
2152
+ {
2153
+ "epoch": 61.6,
2154
+ "learning_rate": 3.76807037587113e-05,
2155
+ "loss": 0.901,
2156
+ "step": 539154
2157
+ },
2158
+ {
2159
+ "epoch": 61.6,
2160
+ "eval_loss": 0.9000225067138672,
2161
+ "eval_runtime": 177.4957,
2162
+ "eval_samples_per_second": 175.328,
2163
+ "eval_steps_per_second": 10.958,
2164
+ "step": 539154
2165
+ },
2166
+ {
2167
+ "epoch": 62.0,
2168
+ "learning_rate": 3.760070832857306e-05,
2169
+ "loss": 0.9026,
2170
+ "step": 542655
2171
+ },
2172
+ {
2173
+ "epoch": 62.0,
2174
+ "eval_loss": 0.8991663455963135,
2175
+ "eval_runtime": 177.5752,
2176
+ "eval_samples_per_second": 175.25,
2177
+ "eval_steps_per_second": 10.953,
2178
+ "step": 542655
2179
+ },
2180
+ {
2181
+ "epoch": 62.4,
2182
+ "learning_rate": 3.7520712898434824e-05,
2183
+ "loss": 0.8988,
2184
+ "step": 546156
2185
+ },
2186
+ {
2187
+ "epoch": 62.4,
2188
+ "eval_loss": 0.9008954763412476,
2189
+ "eval_runtime": 177.533,
2190
+ "eval_samples_per_second": 175.291,
2191
+ "eval_steps_per_second": 10.956,
2192
+ "step": 546156
2193
+ },
2194
+ {
2195
+ "epoch": 62.8,
2196
+ "learning_rate": 3.744071746829659e-05,
2197
+ "loss": 0.8994,
2198
+ "step": 549657
2199
+ },
2200
+ {
2201
+ "epoch": 62.8,
2202
+ "eval_loss": 0.9005922675132751,
2203
+ "eval_runtime": 177.6151,
2204
+ "eval_samples_per_second": 175.21,
2205
+ "eval_steps_per_second": 10.951,
2206
+ "step": 549657
2207
+ },
2208
+ {
2209
+ "epoch": 63.2,
2210
+ "learning_rate": 3.736072203815835e-05,
2211
+ "loss": 0.8983,
2212
+ "step": 553158
2213
+ },
2214
+ {
2215
+ "epoch": 63.2,
2216
+ "eval_loss": 0.9030284881591797,
2217
+ "eval_runtime": 177.5458,
2218
+ "eval_samples_per_second": 175.279,
2219
+ "eval_steps_per_second": 10.955,
2220
+ "step": 553158
2221
+ },
2222
+ {
2223
+ "epoch": 63.6,
2224
+ "learning_rate": 3.728072660802011e-05,
2225
+ "loss": 0.9,
2226
+ "step": 556659
2227
+ },
2228
+ {
2229
+ "epoch": 63.6,
2230
+ "eval_loss": 0.8991916179656982,
2231
+ "eval_runtime": 177.4727,
2232
+ "eval_samples_per_second": 175.351,
2233
+ "eval_steps_per_second": 10.959,
2234
+ "step": 556659
2235
+ },
2236
+ {
2237
+ "epoch": 64.0,
2238
+ "learning_rate": 3.720073117788187e-05,
2239
+ "loss": 0.8994,
2240
+ "step": 560160
2241
+ },
2242
+ {
2243
+ "epoch": 64.0,
2244
+ "eval_loss": 0.8985081315040588,
2245
+ "eval_runtime": 177.5551,
2246
+ "eval_samples_per_second": 175.27,
2247
+ "eval_steps_per_second": 10.954,
2248
+ "step": 560160
2249
+ },
2250
+ {
2251
+ "epoch": 64.4,
2252
+ "learning_rate": 3.712073574774364e-05,
2253
+ "loss": 0.8953,
2254
+ "step": 563661
2255
+ },
2256
+ {
2257
+ "epoch": 64.4,
2258
+ "eval_loss": 0.900728166103363,
2259
+ "eval_runtime": 177.7452,
2260
+ "eval_samples_per_second": 175.082,
2261
+ "eval_steps_per_second": 10.943,
2262
+ "step": 563661
2263
+ },
2264
+ {
2265
+ "epoch": 64.8,
2266
+ "learning_rate": 3.7040740317605394e-05,
2267
+ "loss": 0.8991,
2268
+ "step": 567162
2269
+ },
2270
+ {
2271
+ "epoch": 64.8,
2272
+ "eval_loss": 0.902450680732727,
2273
+ "eval_runtime": 177.6955,
2274
+ "eval_samples_per_second": 175.131,
2275
+ "eval_steps_per_second": 10.946,
2276
+ "step": 567162
2277
+ },
2278
+ {
2279
+ "epoch": 65.2,
2280
+ "learning_rate": 3.6960744887467156e-05,
2281
+ "loss": 0.8964,
2282
+ "step": 570663
2283
+ },
2284
+ {
2285
+ "epoch": 65.2,
2286
+ "eval_loss": 0.8983866572380066,
2287
+ "eval_runtime": 177.5924,
2288
+ "eval_samples_per_second": 175.233,
2289
+ "eval_steps_per_second": 10.952,
2290
+ "step": 570663
2291
+ },
2292
+ {
2293
+ "epoch": 65.6,
2294
+ "learning_rate": 3.688074945732892e-05,
2295
+ "loss": 0.8959,
2296
+ "step": 574164
2297
+ },
2298
+ {
2299
+ "epoch": 65.6,
2300
+ "eval_loss": 0.9006548523902893,
2301
+ "eval_runtime": 177.7333,
2302
+ "eval_samples_per_second": 175.094,
2303
+ "eval_steps_per_second": 10.943,
2304
+ "step": 574164
2305
+ },
2306
+ {
2307
+ "epoch": 66.0,
2308
+ "learning_rate": 3.680075402719068e-05,
2309
+ "loss": 0.8981,
2310
+ "step": 577665
2311
+ },
2312
+ {
2313
+ "epoch": 66.0,
2314
+ "eval_loss": 0.8985300064086914,
2315
+ "eval_runtime": 177.5714,
2316
+ "eval_samples_per_second": 175.253,
2317
+ "eval_steps_per_second": 10.953,
2318
+ "step": 577665
2319
+ },
2320
+ {
2321
+ "epoch": 66.4,
2322
+ "learning_rate": 3.672075859705244e-05,
2323
+ "loss": 0.8922,
2324
+ "step": 581166
2325
+ },
2326
+ {
2327
+ "epoch": 66.4,
2328
+ "eval_loss": 0.8925117254257202,
2329
+ "eval_runtime": 177.5096,
2330
+ "eval_samples_per_second": 175.314,
2331
+ "eval_steps_per_second": 10.957,
2332
+ "step": 581166
2333
+ },
2334
+ {
2335
+ "epoch": 66.8,
2336
+ "learning_rate": 3.664076316691421e-05,
2337
+ "loss": 0.896,
2338
+ "step": 584667
2339
+ },
2340
+ {
2341
+ "epoch": 66.8,
2342
+ "eval_loss": 0.8964714407920837,
2343
+ "eval_runtime": 177.5858,
2344
+ "eval_samples_per_second": 175.239,
2345
+ "eval_steps_per_second": 10.952,
2346
+ "step": 584667
2347
+ },
2348
+ {
2349
+ "epoch": 67.2,
2350
+ "learning_rate": 3.656076773677596e-05,
2351
+ "loss": 0.8942,
2352
+ "step": 588168
2353
+ },
2354
+ {
2355
+ "epoch": 67.2,
2356
+ "eval_loss": 0.8949043154716492,
2357
+ "eval_runtime": 177.7347,
2358
+ "eval_samples_per_second": 175.092,
2359
+ "eval_steps_per_second": 10.943,
2360
+ "step": 588168
2361
+ },
2362
+ {
2363
+ "epoch": 67.6,
2364
+ "learning_rate": 3.6480772306637726e-05,
2365
+ "loss": 0.8937,
2366
+ "step": 591669
2367
+ },
2368
+ {
2369
+ "epoch": 67.6,
2370
+ "eval_loss": 0.8952317237854004,
2371
+ "eval_runtime": 177.6327,
2372
+ "eval_samples_per_second": 175.193,
2373
+ "eval_steps_per_second": 10.95,
2374
+ "step": 591669
2375
+ },
2376
+ {
2377
+ "epoch": 68.0,
2378
+ "learning_rate": 3.640077687649949e-05,
2379
+ "loss": 0.8953,
2380
+ "step": 595170
2381
+ },
2382
+ {
2383
+ "epoch": 68.0,
2384
+ "eval_loss": 0.894290566444397,
2385
+ "eval_runtime": 177.4358,
2386
+ "eval_samples_per_second": 175.387,
2387
+ "eval_steps_per_second": 10.962,
2388
+ "step": 595170
2389
+ },
2390
+ {
2391
+ "epoch": 68.4,
2392
+ "learning_rate": 3.632078144636125e-05,
2393
+ "loss": 0.8907,
2394
+ "step": 598671
2395
+ },
2396
+ {
2397
+ "epoch": 68.4,
2398
+ "eval_loss": 0.8952488899230957,
2399
+ "eval_runtime": 177.6568,
2400
+ "eval_samples_per_second": 175.169,
2401
+ "eval_steps_per_second": 10.948,
2402
+ "step": 598671
2403
+ },
2404
+ {
2405
+ "epoch": 68.8,
2406
+ "learning_rate": 3.624078601622301e-05,
2407
+ "loss": 0.8919,
2408
+ "step": 602172
2409
+ },
2410
+ {
2411
+ "epoch": 68.8,
2412
+ "eval_loss": 0.8931904435157776,
2413
+ "eval_runtime": 177.5937,
2414
+ "eval_samples_per_second": 175.231,
2415
+ "eval_steps_per_second": 10.952,
2416
+ "step": 602172
2417
+ },
2418
+ {
2419
+ "epoch": 69.2,
2420
+ "learning_rate": 3.616079058608478e-05,
2421
+ "loss": 0.8933,
2422
+ "step": 605673
2423
+ },
2424
+ {
2425
+ "epoch": 69.2,
2426
+ "eval_loss": 0.89773029088974,
2427
+ "eval_runtime": 177.7324,
2428
+ "eval_samples_per_second": 175.095,
2429
+ "eval_steps_per_second": 10.943,
2430
+ "step": 605673
2431
+ },
2432
+ {
2433
+ "epoch": 69.6,
2434
+ "learning_rate": 3.608079515594653e-05,
2435
+ "loss": 0.891,
2436
+ "step": 609174
2437
+ },
2438
+ {
2439
+ "epoch": 69.6,
2440
+ "eval_loss": 0.894548773765564,
2441
+ "eval_runtime": 177.5908,
2442
+ "eval_samples_per_second": 175.234,
2443
+ "eval_steps_per_second": 10.952,
2444
+ "step": 609174
2445
+ },
2446
+ {
2447
+ "epoch": 70.0,
2448
+ "learning_rate": 3.6000799725808295e-05,
2449
+ "loss": 0.8932,
2450
+ "step": 612675
2451
+ },
2452
+ {
2453
+ "epoch": 70.0,
2454
+ "eval_loss": 0.8936890959739685,
2455
+ "eval_runtime": 177.5711,
2456
+ "eval_samples_per_second": 175.254,
2457
+ "eval_steps_per_second": 10.953,
2458
+ "step": 612675
2459
+ },
2460
+ {
2461
+ "epoch": 70.4,
2462
+ "learning_rate": 3.592080429567006e-05,
2463
+ "loss": 0.8882,
2464
+ "step": 616176
2465
+ },
2466
+ {
2467
+ "epoch": 70.4,
2468
+ "eval_loss": 0.8970974683761597,
2469
+ "eval_runtime": 177.5388,
2470
+ "eval_samples_per_second": 175.286,
2471
+ "eval_steps_per_second": 10.955,
2472
+ "step": 616176
2473
+ },
2474
+ {
2475
+ "epoch": 70.8,
2476
+ "learning_rate": 3.584080886553182e-05,
2477
+ "loss": 0.8907,
2478
+ "step": 619677
2479
+ },
2480
+ {
2481
+ "epoch": 70.8,
2482
+ "eval_loss": 0.8894772529602051,
2483
+ "eval_runtime": 178.5288,
2484
+ "eval_samples_per_second": 174.314,
2485
+ "eval_steps_per_second": 10.895,
2486
+ "step": 619677
2487
+ },
2488
+ {
2489
+ "epoch": 71.2,
2490
+ "learning_rate": 3.5760813435393584e-05,
2491
+ "loss": 0.8893,
2492
+ "step": 623178
2493
+ },
2494
+ {
2495
+ "epoch": 71.2,
2496
+ "eval_loss": 0.8943666219711304,
2497
+ "eval_runtime": 177.452,
2498
+ "eval_samples_per_second": 175.371,
2499
+ "eval_steps_per_second": 10.961,
2500
+ "step": 623178
2501
+ },
2502
+ {
2503
+ "epoch": 71.6,
2504
+ "learning_rate": 3.5680818005255346e-05,
2505
+ "loss": 0.8883,
2506
+ "step": 626679
2507
+ },
2508
+ {
2509
+ "epoch": 71.6,
2510
+ "eval_loss": 0.892691969871521,
2511
+ "eval_runtime": 177.2653,
2512
+ "eval_samples_per_second": 175.556,
2513
+ "eval_steps_per_second": 10.972,
2514
+ "step": 626679
2515
+ },
2516
+ {
2517
+ "epoch": 72.0,
2518
+ "learning_rate": 3.56008225751171e-05,
2519
+ "loss": 0.8917,
2520
+ "step": 630180
2521
+ },
2522
+ {
2523
+ "epoch": 72.0,
2524
+ "eval_loss": 0.890504777431488,
2525
+ "eval_runtime": 177.5314,
2526
+ "eval_samples_per_second": 175.293,
2527
+ "eval_steps_per_second": 10.956,
2528
+ "step": 630180
2529
+ },
2530
+ {
2531
+ "epoch": 72.4,
2532
+ "learning_rate": 3.552082714497887e-05,
2533
+ "loss": 0.8862,
2534
+ "step": 633681
2535
+ },
2536
+ {
2537
+ "epoch": 72.4,
2538
+ "eval_loss": 0.8900084495544434,
2539
+ "eval_runtime": 177.3085,
2540
+ "eval_samples_per_second": 175.513,
2541
+ "eval_steps_per_second": 10.97,
2542
+ "step": 633681
2543
+ },
2544
+ {
2545
+ "epoch": 72.8,
2546
+ "learning_rate": 3.544083171484063e-05,
2547
+ "loss": 0.8886,
2548
+ "step": 637182
2549
+ },
2550
+ {
2551
+ "epoch": 72.8,
2552
+ "eval_loss": 0.8953748941421509,
2553
+ "eval_runtime": 177.334,
2554
+ "eval_samples_per_second": 175.488,
2555
+ "eval_steps_per_second": 10.968,
2556
+ "step": 637182
2557
+ },
2558
+ {
2559
+ "epoch": 73.2,
2560
+ "learning_rate": 3.536083628470239e-05,
2561
+ "loss": 0.8874,
2562
+ "step": 640683
2563
+ },
2564
+ {
2565
+ "epoch": 73.2,
2566
+ "eval_loss": 0.8892679810523987,
2567
+ "eval_runtime": 177.3431,
2568
+ "eval_samples_per_second": 175.479,
2569
+ "eval_steps_per_second": 10.967,
2570
+ "step": 640683
2571
+ },
2572
+ {
2573
+ "epoch": 73.6,
2574
+ "learning_rate": 3.528084085456415e-05,
2575
+ "loss": 0.8866,
2576
+ "step": 644184
2577
+ },
2578
+ {
2579
+ "epoch": 73.6,
2580
+ "eval_loss": 0.8939085602760315,
2581
+ "eval_runtime": 177.3428,
2582
+ "eval_samples_per_second": 175.479,
2583
+ "eval_steps_per_second": 10.967,
2584
+ "step": 644184
2585
+ },
2586
+ {
2587
+ "epoch": 74.0,
2588
+ "learning_rate": 3.520084542442591e-05,
2589
+ "loss": 0.8893,
2590
+ "step": 647685
2591
+ },
2592
+ {
2593
+ "epoch": 74.0,
2594
+ "eval_loss": 0.8904389142990112,
2595
+ "eval_runtime": 177.4175,
2596
+ "eval_samples_per_second": 175.406,
2597
+ "eval_steps_per_second": 10.963,
2598
+ "step": 647685
2599
+ },
2600
+ {
2601
+ "epoch": 74.4,
2602
+ "learning_rate": 3.512084999428767e-05,
2603
+ "loss": 0.8838,
2604
+ "step": 651186
2605
+ },
2606
+ {
2607
+ "epoch": 74.4,
2608
+ "eval_loss": 0.8905112743377686,
2609
+ "eval_runtime": 177.3904,
2610
+ "eval_samples_per_second": 175.432,
2611
+ "eval_steps_per_second": 10.965,
2612
+ "step": 651186
2613
+ },
2614
+ {
2615
+ "epoch": 74.8,
2616
+ "learning_rate": 3.5040854564149435e-05,
2617
+ "loss": 0.8846,
2618
+ "step": 654687
2619
+ },
2620
+ {
2621
+ "epoch": 74.8,
2622
+ "eval_loss": 0.8922948241233826,
2623
+ "eval_runtime": 177.434,
2624
+ "eval_samples_per_second": 175.389,
2625
+ "eval_steps_per_second": 10.962,
2626
+ "step": 654687
2627
+ },
2628
+ {
2629
+ "epoch": 75.2,
2630
+ "learning_rate": 3.49608591340112e-05,
2631
+ "loss": 0.8862,
2632
+ "step": 658188
2633
+ },
2634
+ {
2635
+ "epoch": 75.2,
2636
+ "eval_loss": 0.8935458660125732,
2637
+ "eval_runtime": 178.2544,
2638
+ "eval_samples_per_second": 174.582,
2639
+ "eval_steps_per_second": 10.911,
2640
+ "step": 658188
2641
+ },
2642
+ {
2643
+ "epoch": 75.6,
2644
+ "learning_rate": 3.488086370387296e-05,
2645
+ "loss": 0.8832,
2646
+ "step": 661689
2647
+ },
2648
+ {
2649
+ "epoch": 75.6,
2650
+ "eval_loss": 0.8909444808959961,
2651
+ "eval_runtime": 177.513,
2652
+ "eval_samples_per_second": 175.311,
2653
+ "eval_steps_per_second": 10.957,
2654
+ "step": 661689
2655
+ },
2656
+ {
2657
+ "epoch": 76.0,
2658
+ "learning_rate": 3.480086827373472e-05,
2659
+ "loss": 0.8843,
2660
+ "step": 665190
2661
+ },
2662
+ {
2663
+ "epoch": 76.0,
2664
+ "eval_loss": 0.8939051032066345,
2665
+ "eval_runtime": 177.5511,
2666
+ "eval_samples_per_second": 175.273,
2667
+ "eval_steps_per_second": 10.955,
2668
+ "step": 665190
2669
+ },
2670
+ {
2671
+ "epoch": 76.4,
2672
+ "learning_rate": 3.472087284359648e-05,
2673
+ "loss": 0.8819,
2674
+ "step": 668691
2675
+ },
2676
+ {
2677
+ "epoch": 76.4,
2678
+ "eval_loss": 0.8863644599914551,
2679
+ "eval_runtime": 177.5405,
2680
+ "eval_samples_per_second": 175.284,
2681
+ "eval_steps_per_second": 10.955,
2682
+ "step": 668691
2683
+ },
2684
+ {
2685
+ "epoch": 76.8,
2686
+ "learning_rate": 3.464087741345825e-05,
2687
+ "loss": 0.8848,
2688
+ "step": 672192
2689
+ },
2690
+ {
2691
+ "epoch": 76.8,
2692
+ "eval_loss": 0.8871325850486755,
2693
+ "eval_runtime": 177.6785,
2694
+ "eval_samples_per_second": 175.148,
2695
+ "eval_steps_per_second": 10.947,
2696
+ "step": 672192
2697
+ },
2698
+ {
2699
+ "epoch": 77.2,
2700
+ "learning_rate": 3.4560881983320004e-05,
2701
+ "loss": 0.8825,
2702
+ "step": 675693
2703
+ },
2704
+ {
2705
+ "epoch": 77.2,
2706
+ "eval_loss": 0.8879318237304688,
2707
+ "eval_runtime": 177.5519,
2708
+ "eval_samples_per_second": 175.273,
2709
+ "eval_steps_per_second": 10.955,
2710
+ "step": 675693
2711
+ },
2712
+ {
2713
+ "epoch": 77.6,
2714
+ "learning_rate": 3.448088655318177e-05,
2715
+ "loss": 0.8812,
2716
+ "step": 679194
2717
+ },
2718
+ {
2719
+ "epoch": 77.6,
2720
+ "eval_loss": 0.88758385181427,
2721
+ "eval_runtime": 177.5541,
2722
+ "eval_samples_per_second": 175.271,
2723
+ "eval_steps_per_second": 10.954,
2724
+ "step": 679194
2725
+ },
2726
+ {
2727
+ "epoch": 78.0,
2728
+ "learning_rate": 3.440089112304353e-05,
2729
+ "loss": 0.8833,
2730
+ "step": 682695
2731
+ },
2732
+ {
2733
+ "epoch": 78.0,
2734
+ "eval_loss": 0.8914999961853027,
2735
+ "eval_runtime": 178.2748,
2736
+ "eval_samples_per_second": 174.562,
2737
+ "eval_steps_per_second": 10.91,
2738
+ "step": 682695
2739
+ },
2740
+ {
2741
+ "epoch": 78.4,
2742
+ "learning_rate": 3.432089569290529e-05,
2743
+ "loss": 0.8817,
2744
+ "step": 686196
2745
+ },
2746
+ {
2747
+ "epoch": 78.4,
2748
+ "eval_loss": 0.8898407816886902,
2749
+ "eval_runtime": 178.1701,
2750
+ "eval_samples_per_second": 174.665,
2751
+ "eval_steps_per_second": 10.917,
2752
+ "step": 686196
2753
+ },
2754
+ {
2755
+ "epoch": 78.8,
2756
+ "learning_rate": 3.424090026276705e-05,
2757
+ "loss": 0.8834,
2758
+ "step": 689697
2759
+ },
2760
+ {
2761
+ "epoch": 78.8,
2762
+ "eval_loss": 0.8876122236251831,
2763
+ "eval_runtime": 178.2895,
2764
+ "eval_samples_per_second": 174.548,
2765
+ "eval_steps_per_second": 10.909,
2766
+ "step": 689697
2767
+ },
2768
+ {
2769
+ "epoch": 79.2,
2770
+ "learning_rate": 3.416090483262882e-05,
2771
+ "loss": 0.8808,
2772
+ "step": 693198
2773
+ },
2774
+ {
2775
+ "epoch": 79.2,
2776
+ "eval_loss": 0.8858633041381836,
2777
+ "eval_runtime": 177.9937,
2778
+ "eval_samples_per_second": 174.838,
2779
+ "eval_steps_per_second": 10.927,
2780
+ "step": 693198
2781
+ },
2782
+ {
2783
+ "epoch": 79.6,
2784
+ "learning_rate": 3.4080909402490574e-05,
2785
+ "loss": 0.8801,
2786
+ "step": 696699
2787
+ },
2788
+ {
2789
+ "epoch": 79.6,
2790
+ "eval_loss": 0.8899697661399841,
2791
+ "eval_runtime": 178.1844,
2792
+ "eval_samples_per_second": 174.651,
2793
+ "eval_steps_per_second": 10.916,
2794
+ "step": 696699
2795
+ },
2796
+ {
2797
+ "epoch": 80.0,
2798
+ "learning_rate": 3.4000913972352336e-05,
2799
+ "loss": 0.8799,
2800
+ "step": 700200
2801
+ },
2802
+ {
2803
+ "epoch": 80.0,
2804
+ "eval_loss": 0.887626051902771,
2805
+ "eval_runtime": 178.2075,
2806
+ "eval_samples_per_second": 174.628,
2807
+ "eval_steps_per_second": 10.914,
2808
+ "step": 700200
2809
+ },
2810
+ {
2811
+ "epoch": 80.4,
2812
+ "learning_rate": 3.39209185422141e-05,
2813
+ "loss": 0.8774,
2814
+ "step": 703701
2815
+ },
2816
+ {
2817
+ "epoch": 80.4,
2818
+ "eval_loss": 0.8899440169334412,
2819
+ "eval_runtime": 178.0542,
2820
+ "eval_samples_per_second": 174.778,
2821
+ "eval_steps_per_second": 10.924,
2822
+ "step": 703701
2823
+ },
2824
+ {
2825
+ "epoch": 80.8,
2826
+ "learning_rate": 3.384092311207586e-05,
2827
+ "loss": 0.8798,
2828
+ "step": 707202
2829
+ },
2830
+ {
2831
+ "epoch": 80.8,
2832
+ "eval_loss": 0.8822316527366638,
2833
+ "eval_runtime": 178.2892,
2834
+ "eval_samples_per_second": 174.548,
2835
+ "eval_steps_per_second": 10.909,
2836
+ "step": 707202
2837
+ },
2838
+ {
2839
+ "epoch": 81.2,
2840
+ "learning_rate": 3.376092768193762e-05,
2841
+ "loss": 0.8783,
2842
+ "step": 710703
2843
+ },
2844
+ {
2845
+ "epoch": 81.2,
2846
+ "eval_loss": 0.889187216758728,
2847
+ "eval_runtime": 177.9073,
2848
+ "eval_samples_per_second": 174.923,
2849
+ "eval_steps_per_second": 10.933,
2850
+ "step": 710703
2851
+ },
2852
+ {
2853
+ "epoch": 81.6,
2854
+ "learning_rate": 3.368093225179939e-05,
2855
+ "loss": 0.879,
2856
+ "step": 714204
2857
+ },
2858
+ {
2859
+ "epoch": 81.6,
2860
+ "eval_loss": 0.8858596086502075,
2861
+ "eval_runtime": 177.9871,
2862
+ "eval_samples_per_second": 174.844,
2863
+ "eval_steps_per_second": 10.928,
2864
+ "step": 714204
2865
+ },
2866
+ {
2867
+ "epoch": 82.0,
2868
+ "learning_rate": 3.360093682166114e-05,
2869
+ "loss": 0.8805,
2870
+ "step": 717705
2871
+ },
2872
+ {
2873
+ "epoch": 82.0,
2874
+ "eval_loss": 0.8828133940696716,
2875
+ "eval_runtime": 178.0841,
2876
+ "eval_samples_per_second": 174.749,
2877
+ "eval_steps_per_second": 10.922,
2878
+ "step": 717705
2879
+ },
2880
+ {
2881
+ "epoch": 82.4,
2882
+ "learning_rate": 3.3520941391522906e-05,
2883
+ "loss": 0.8745,
2884
+ "step": 721206
2885
+ },
2886
+ {
2887
+ "epoch": 82.4,
2888
+ "eval_loss": 0.8833040595054626,
2889
+ "eval_runtime": 178.3724,
2890
+ "eval_samples_per_second": 174.466,
2891
+ "eval_steps_per_second": 10.904,
2892
+ "step": 721206
2893
+ },
2894
+ {
2895
+ "epoch": 82.8,
2896
+ "learning_rate": 3.344094596138467e-05,
2897
+ "loss": 0.8779,
2898
+ "step": 724707
2899
+ },
2900
+ {
2901
+ "epoch": 82.8,
2902
+ "eval_loss": 0.8839106559753418,
2903
+ "eval_runtime": 178.0087,
2904
+ "eval_samples_per_second": 174.823,
2905
+ "eval_steps_per_second": 10.926,
2906
+ "step": 724707
2907
+ },
2908
+ {
2909
+ "epoch": 83.2,
2910
+ "learning_rate": 3.336095053124643e-05,
2911
+ "loss": 0.8758,
2912
+ "step": 728208
2913
+ },
2914
+ {
2915
+ "epoch": 83.2,
2916
+ "eval_loss": 0.8863241076469421,
2917
+ "eval_runtime": 178.0608,
2918
+ "eval_samples_per_second": 174.772,
2919
+ "eval_steps_per_second": 10.923,
2920
+ "step": 728208
2921
+ },
2922
+ {
2923
+ "epoch": 83.6,
2924
+ "learning_rate": 3.3280955101108194e-05,
2925
+ "loss": 0.8747,
2926
+ "step": 731709
2927
+ },
2928
+ {
2929
+ "epoch": 83.6,
2930
+ "eval_loss": 0.8861810564994812,
2931
+ "eval_runtime": 178.0994,
2932
+ "eval_samples_per_second": 174.734,
2933
+ "eval_steps_per_second": 10.921,
2934
+ "step": 731709
2935
+ },
2936
+ {
2937
+ "epoch": 84.0,
2938
+ "learning_rate": 3.320095967096996e-05,
2939
+ "loss": 0.8768,
2940
+ "step": 735210
2941
+ },
2942
+ {
2943
+ "epoch": 84.0,
2944
+ "eval_loss": 0.8819161653518677,
2945
+ "eval_runtime": 178.1304,
2946
+ "eval_samples_per_second": 174.703,
2947
+ "eval_steps_per_second": 10.919,
2948
+ "step": 735210
2949
+ },
2950
+ {
2951
+ "epoch": 84.4,
2952
+ "learning_rate": 3.312096424083171e-05,
2953
+ "loss": 0.8721,
2954
+ "step": 738711
2955
+ },
2956
+ {
2957
+ "epoch": 84.4,
2958
+ "eval_loss": 0.8827645778656006,
2959
+ "eval_runtime": 178.1975,
2960
+ "eval_samples_per_second": 174.638,
2961
+ "eval_steps_per_second": 10.915,
2962
+ "step": 738711
2963
+ },
2964
+ {
2965
+ "epoch": 84.8,
2966
+ "learning_rate": 3.304096881069348e-05,
2967
+ "loss": 0.8762,
2968
+ "step": 742212
2969
+ },
2970
+ {
2971
+ "epoch": 84.8,
2972
+ "eval_loss": 0.8821256160736084,
2973
+ "eval_runtime": 178.2935,
2974
+ "eval_samples_per_second": 174.544,
2975
+ "eval_steps_per_second": 10.909,
2976
+ "step": 742212
2977
+ },
2978
+ {
2979
+ "epoch": 85.2,
2980
+ "learning_rate": 3.296097338055524e-05,
2981
+ "loss": 0.8745,
2982
+ "step": 745713
2983
+ },
2984
+ {
2985
+ "epoch": 85.2,
2986
+ "eval_loss": 0.8844091296195984,
2987
+ "eval_runtime": 178.1959,
2988
+ "eval_samples_per_second": 174.639,
2989
+ "eval_steps_per_second": 10.915,
2990
+ "step": 745713
2991
+ },
2992
+ {
2993
+ "epoch": 85.6,
2994
+ "learning_rate": 3.2880977950417e-05,
2995
+ "loss": 0.8718,
2996
+ "step": 749214
2997
+ },
2998
+ {
2999
+ "epoch": 85.6,
3000
+ "eval_loss": 0.8821405172348022,
3001
+ "eval_runtime": 178.5043,
3002
+ "eval_samples_per_second": 174.338,
3003
+ "eval_steps_per_second": 10.896,
3004
+ "step": 749214
3005
+ },
3006
+ {
3007
+ "epoch": 86.0,
3008
+ "learning_rate": 3.2800982520278764e-05,
3009
+ "loss": 0.8753,
3010
+ "step": 752715
3011
+ },
3012
+ {
3013
+ "epoch": 86.0,
3014
+ "eval_loss": 0.8849576711654663,
3015
+ "eval_runtime": 178.2889,
3016
+ "eval_samples_per_second": 174.548,
3017
+ "eval_steps_per_second": 10.909,
3018
+ "step": 752715
3019
+ },
3020
+ {
3021
+ "epoch": 86.4,
3022
+ "learning_rate": 3.2720987090140526e-05,
3023
+ "loss": 0.8702,
3024
+ "step": 756216
3025
+ },
3026
+ {
3027
+ "epoch": 86.4,
3028
+ "eval_loss": 0.8836557865142822,
3029
+ "eval_runtime": 178.073,
3030
+ "eval_samples_per_second": 174.76,
3031
+ "eval_steps_per_second": 10.922,
3032
+ "step": 756216
3033
+ },
3034
+ {
3035
+ "epoch": 86.8,
3036
+ "learning_rate": 3.264099166000228e-05,
3037
+ "loss": 0.8745,
3038
+ "step": 759717
3039
+ },
3040
+ {
3041
+ "epoch": 86.8,
3042
+ "eval_loss": 0.8812028765678406,
3043
+ "eval_runtime": 178.1701,
3044
+ "eval_samples_per_second": 174.665,
3045
+ "eval_steps_per_second": 10.917,
3046
+ "step": 759717
3047
+ },
3048
+ {
3049
+ "epoch": 87.2,
3050
+ "learning_rate": 3.256099622986405e-05,
3051
+ "loss": 0.8726,
3052
+ "step": 763218
3053
+ },
3054
+ {
3055
+ "epoch": 87.2,
3056
+ "eval_loss": 0.8863665461540222,
3057
+ "eval_runtime": 178.1003,
3058
+ "eval_samples_per_second": 174.733,
3059
+ "eval_steps_per_second": 10.921,
3060
+ "step": 763218
3061
+ },
3062
+ {
3063
+ "epoch": 87.59,
3064
+ "learning_rate": 3.248100079972581e-05,
3065
+ "loss": 0.8731,
3066
+ "step": 766719
3067
+ },
3068
+ {
3069
+ "epoch": 87.59,
3070
+ "eval_loss": 0.8802133798599243,
3071
+ "eval_runtime": 177.7361,
3072
+ "eval_samples_per_second": 175.091,
3073
+ "eval_steps_per_second": 10.943,
3074
+ "step": 766719
3075
+ },
3076
+ {
3077
+ "epoch": 87.99,
3078
+ "learning_rate": 3.240100536958757e-05,
3079
+ "loss": 0.8735,
3080
+ "step": 770220
3081
+ },
3082
+ {
3083
+ "epoch": 87.99,
3084
+ "eval_loss": 0.87850421667099,
3085
+ "eval_runtime": 178.0197,
3086
+ "eval_samples_per_second": 174.812,
3087
+ "eval_steps_per_second": 10.926,
3088
+ "step": 770220
3089
+ },
3090
+ {
3091
+ "epoch": 88.39,
3092
+ "learning_rate": 3.232100993944933e-05,
3093
+ "loss": 0.8684,
3094
+ "step": 773721
3095
+ },
3096
+ {
3097
+ "epoch": 88.39,
3098
+ "eval_loss": 0.8797884583473206,
3099
+ "eval_runtime": 177.9064,
3100
+ "eval_samples_per_second": 174.923,
3101
+ "eval_steps_per_second": 10.933,
3102
+ "step": 773721
3103
+ },
3104
+ {
3105
+ "epoch": 88.79,
3106
+ "learning_rate": 3.2241014509311096e-05,
3107
+ "loss": 0.8727,
3108
+ "step": 777222
3109
+ },
3110
+ {
3111
+ "epoch": 88.79,
3112
+ "eval_loss": 0.8834565281867981,
3113
+ "eval_runtime": 178.2741,
3114
+ "eval_samples_per_second": 174.563,
3115
+ "eval_steps_per_second": 10.91,
3116
+ "step": 777222
3117
+ },
3118
+ {
3119
+ "epoch": 89.19,
3120
+ "learning_rate": 3.216101907917285e-05,
3121
+ "loss": 0.8704,
3122
+ "step": 780723
3123
+ },
3124
+ {
3125
+ "epoch": 89.19,
3126
+ "eval_loss": 0.8808990120887756,
3127
+ "eval_runtime": 177.9944,
3128
+ "eval_samples_per_second": 174.837,
3129
+ "eval_steps_per_second": 10.927,
3130
+ "step": 780723
3131
+ },
3132
+ {
3133
+ "epoch": 89.59,
3134
+ "learning_rate": 3.208102364903462e-05,
3135
+ "loss": 0.8691,
3136
+ "step": 784224
3137
+ },
3138
+ {
3139
+ "epoch": 89.59,
3140
+ "eval_loss": 0.8784425854682922,
3141
+ "eval_runtime": 177.87,
3142
+ "eval_samples_per_second": 174.959,
3143
+ "eval_steps_per_second": 10.935,
3144
+ "step": 784224
3145
+ },
3146
+ {
3147
+ "epoch": 89.99,
3148
+ "learning_rate": 3.200102821889638e-05,
3149
+ "loss": 0.8718,
3150
+ "step": 787725
3151
+ },
3152
+ {
3153
+ "epoch": 89.99,
3154
+ "eval_loss": 0.8777753114700317,
3155
+ "eval_runtime": 178.0354,
3156
+ "eval_samples_per_second": 174.797,
3157
+ "eval_steps_per_second": 10.925,
3158
+ "step": 787725
3159
+ },
3160
+ {
3161
+ "epoch": 90.39,
3162
+ "learning_rate": 3.192103278875814e-05,
3163
+ "loss": 0.8685,
3164
+ "step": 791226
3165
+ },
3166
+ {
3167
+ "epoch": 90.39,
3168
+ "eval_loss": 0.8818721175193787,
3169
+ "eval_runtime": 177.85,
3170
+ "eval_samples_per_second": 174.979,
3171
+ "eval_steps_per_second": 10.936,
3172
+ "step": 791226
3173
+ },
3174
+ {
3175
+ "epoch": 90.79,
3176
+ "learning_rate": 3.18410373586199e-05,
3177
+ "loss": 0.8686,
3178
+ "step": 794727
3179
+ },
3180
+ {
3181
+ "epoch": 90.79,
3182
+ "eval_loss": 0.878167450428009,
3183
+ "eval_runtime": 177.8069,
3184
+ "eval_samples_per_second": 175.021,
3185
+ "eval_steps_per_second": 10.939,
3186
+ "step": 794727
3187
+ },
3188
+ {
3189
+ "epoch": 91.19,
3190
+ "learning_rate": 3.1761041928481665e-05,
3191
+ "loss": 0.8692,
3192
+ "step": 798228
3193
+ },
3194
+ {
3195
+ "epoch": 91.19,
3196
+ "eval_loss": 0.8786768913269043,
3197
+ "eval_runtime": 177.7581,
3198
+ "eval_samples_per_second": 175.069,
3199
+ "eval_steps_per_second": 10.942,
3200
+ "step": 798228
3201
+ },
3202
+ {
3203
+ "epoch": 91.59,
3204
+ "learning_rate": 3.168104649834343e-05,
3205
+ "loss": 0.8649,
3206
+ "step": 801729
3207
+ },
3208
+ {
3209
+ "epoch": 91.59,
3210
+ "eval_loss": 0.8777763247489929,
3211
+ "eval_runtime": 178.0586,
3212
+ "eval_samples_per_second": 174.774,
3213
+ "eval_steps_per_second": 10.923,
3214
+ "step": 801729
3215
+ },
3216
+ {
3217
+ "epoch": 91.99,
3218
+ "learning_rate": 3.160105106820519e-05,
3219
+ "loss": 0.8699,
3220
+ "step": 805230
3221
+ },
3222
+ {
3223
+ "epoch": 91.99,
3224
+ "eval_loss": 0.8764858841896057,
3225
+ "eval_runtime": 178.3307,
3226
+ "eval_samples_per_second": 174.507,
3227
+ "eval_steps_per_second": 10.907,
3228
+ "step": 805230
3229
+ },
3230
+ {
3231
+ "epoch": 92.39,
3232
+ "learning_rate": 3.152105563806695e-05,
3233
+ "loss": 0.8734,
3234
+ "step": 808731
3235
+ },
3236
+ {
3237
+ "epoch": 92.39,
3238
+ "eval_loss": 0.8796523809432983,
3239
+ "eval_runtime": 178.1053,
3240
+ "eval_samples_per_second": 174.728,
3241
+ "eval_steps_per_second": 10.921,
3242
+ "step": 808731
3243
+ },
3244
+ {
3245
+ "epoch": 92.79,
3246
+ "learning_rate": 3.1441060207928716e-05,
3247
+ "loss": 0.8762,
3248
+ "step": 812232
3249
+ },
3250
+ {
3251
+ "epoch": 92.79,
3252
+ "eval_loss": 0.873812198638916,
3253
+ "eval_runtime": 177.9129,
3254
+ "eval_samples_per_second": 174.917,
3255
+ "eval_steps_per_second": 10.932,
3256
+ "step": 812232
3257
+ },
3258
+ {
3259
+ "epoch": 93.19,
3260
+ "learning_rate": 3.136106477779047e-05,
3261
+ "loss": 0.875,
3262
+ "step": 815733
3263
+ },
3264
+ {
3265
+ "epoch": 93.19,
3266
+ "eval_loss": 0.8739504814147949,
3267
+ "eval_runtime": 178.3464,
3268
+ "eval_samples_per_second": 174.492,
3269
+ "eval_steps_per_second": 10.906,
3270
+ "step": 815733
3271
+ },
3272
+ {
3273
+ "epoch": 93.59,
3274
+ "learning_rate": 3.1281069347652235e-05,
3275
+ "loss": 0.8749,
3276
+ "step": 819234
3277
+ },
3278
+ {
3279
+ "epoch": 93.59,
3280
+ "eval_loss": 0.877826988697052,
3281
+ "eval_runtime": 177.9396,
3282
+ "eval_samples_per_second": 174.891,
3283
+ "eval_steps_per_second": 10.931,
3284
+ "step": 819234
3285
+ },
3286
+ {
3287
+ "epoch": 93.99,
3288
+ "learning_rate": 3.1201073917514e-05,
3289
+ "loss": 0.8766,
3290
+ "step": 822735
3291
+ },
3292
+ {
3293
+ "epoch": 93.99,
3294
+ "eval_loss": 0.8729666471481323,
3295
+ "eval_runtime": 178.2108,
3296
+ "eval_samples_per_second": 174.625,
3297
+ "eval_steps_per_second": 10.914,
3298
+ "step": 822735
3299
+ },
3300
+ {
3301
+ "epoch": 94.39,
3302
+ "learning_rate": 3.112107848737576e-05,
3303
+ "loss": 0.8726,
3304
+ "step": 826236
3305
+ },
3306
+ {
3307
+ "epoch": 94.39,
3308
+ "eval_loss": 0.8738223910331726,
3309
+ "eval_runtime": 177.8319,
3310
+ "eval_samples_per_second": 174.997,
3311
+ "eval_steps_per_second": 10.937,
3312
+ "step": 826236
3313
+ },
3314
+ {
3315
+ "epoch": 94.79,
3316
+ "learning_rate": 3.1041083057237516e-05,
3317
+ "loss": 0.8752,
3318
+ "step": 829737
3319
+ },
3320
+ {
3321
+ "epoch": 94.79,
3322
+ "eval_loss": 0.8740049600601196,
3323
+ "eval_runtime": 178.0665,
3324
+ "eval_samples_per_second": 174.766,
3325
+ "eval_steps_per_second": 10.923,
3326
+ "step": 829737
3327
+ },
3328
+ {
3329
+ "epoch": 95.19,
3330
+ "learning_rate": 3.0961087627099286e-05,
3331
+ "loss": 0.8737,
3332
+ "step": 833238
3333
+ },
3334
+ {
3335
+ "epoch": 95.19,
3336
+ "eval_loss": 0.871174693107605,
3337
+ "eval_runtime": 177.988,
3338
+ "eval_samples_per_second": 174.843,
3339
+ "eval_steps_per_second": 10.928,
3340
+ "step": 833238
3341
+ },
3342
+ {
3343
+ "epoch": 95.59,
3344
+ "learning_rate": 3.088109219696104e-05,
3345
+ "loss": 0.8723,
3346
+ "step": 836739
3347
+ },
3348
+ {
3349
+ "epoch": 95.59,
3350
+ "eval_loss": 0.8785658478736877,
3351
+ "eval_runtime": 178.1263,
3352
+ "eval_samples_per_second": 174.707,
3353
+ "eval_steps_per_second": 10.919,
3354
+ "step": 836739
3355
+ },
3356
+ {
3357
+ "epoch": 95.99,
3358
+ "learning_rate": 3.0801096766822805e-05,
3359
+ "loss": 0.8749,
3360
+ "step": 840240
3361
+ },
3362
+ {
3363
+ "epoch": 95.99,
3364
+ "eval_loss": 0.8714969158172607,
3365
+ "eval_runtime": 178.1533,
3366
+ "eval_samples_per_second": 174.681,
3367
+ "eval_steps_per_second": 10.918,
3368
+ "step": 840240
3369
+ },
3370
+ {
3371
+ "epoch": 96.39,
3372
+ "learning_rate": 3.072110133668457e-05,
3373
+ "loss": 0.8707,
3374
+ "step": 843741
3375
+ },
3376
+ {
3377
+ "epoch": 96.39,
3378
+ "eval_loss": 0.8721190094947815,
3379
+ "eval_runtime": 177.9697,
3380
+ "eval_samples_per_second": 174.861,
3381
+ "eval_steps_per_second": 10.929,
3382
+ "step": 843741
3383
+ },
3384
+ {
3385
+ "epoch": 96.79,
3386
+ "learning_rate": 3.064110590654633e-05,
3387
+ "loss": 0.8718,
3388
+ "step": 847242
3389
+ },
3390
+ {
3391
+ "epoch": 96.79,
3392
+ "eval_loss": 0.8713057637214661,
3393
+ "eval_runtime": 178.2884,
3394
+ "eval_samples_per_second": 174.549,
3395
+ "eval_steps_per_second": 10.909,
3396
+ "step": 847242
3397
+ },
3398
+ {
3399
+ "epoch": 97.19,
3400
+ "learning_rate": 3.0561110476408086e-05,
3401
+ "loss": 0.8731,
3402
+ "step": 850743
3403
+ },
3404
+ {
3405
+ "epoch": 97.19,
3406
+ "eval_loss": 0.8710973858833313,
3407
+ "eval_runtime": 178.0006,
3408
+ "eval_samples_per_second": 174.831,
3409
+ "eval_steps_per_second": 10.927,
3410
+ "step": 850743
3411
+ },
3412
+ {
3413
+ "epoch": 97.59,
3414
+ "learning_rate": 3.0481115046269852e-05,
3415
+ "loss": 0.872,
3416
+ "step": 854244
3417
+ },
3418
+ {
3419
+ "epoch": 97.59,
3420
+ "eval_loss": 0.8724552989006042,
3421
+ "eval_runtime": 177.9577,
3422
+ "eval_samples_per_second": 174.873,
3423
+ "eval_steps_per_second": 10.93,
3424
+ "step": 854244
3425
+ },
3426
+ {
3427
+ "epoch": 97.99,
3428
+ "learning_rate": 3.040111961613161e-05,
3429
+ "loss": 0.872,
3430
+ "step": 857745
3431
+ },
3432
+ {
3433
+ "epoch": 97.99,
3434
+ "eval_loss": 0.8701831698417664,
3435
+ "eval_runtime": 177.9492,
3436
+ "eval_samples_per_second": 174.881,
3437
+ "eval_steps_per_second": 10.93,
3438
+ "step": 857745
3439
+ },
3440
+ {
3441
+ "epoch": 98.39,
3442
+ "learning_rate": 3.0321124185993377e-05,
3443
+ "loss": 0.8688,
3444
+ "step": 861246
3445
+ },
3446
+ {
3447
+ "epoch": 98.39,
3448
+ "eval_loss": 0.868754506111145,
3449
+ "eval_runtime": 178.1695,
3450
+ "eval_samples_per_second": 174.665,
3451
+ "eval_steps_per_second": 10.917,
3452
+ "step": 861246
3453
+ },
3454
+ {
3455
+ "epoch": 98.79,
3456
+ "learning_rate": 3.0241128755855137e-05,
3457
+ "loss": 0.8712,
3458
+ "step": 864747
3459
+ },
3460
+ {
3461
+ "epoch": 98.79,
3462
+ "eval_loss": 0.8745043873786926,
3463
+ "eval_runtime": 178.0858,
3464
+ "eval_samples_per_second": 174.747,
3465
+ "eval_steps_per_second": 10.922,
3466
+ "step": 864747
3467
+ },
3468
+ {
3469
+ "epoch": 99.19,
3470
+ "learning_rate": 3.01611333257169e-05,
3471
+ "loss": 0.8691,
3472
+ "step": 868248
3473
+ },
3474
+ {
3475
+ "epoch": 99.19,
3476
+ "eval_loss": 0.8719183802604675,
3477
+ "eval_runtime": 177.9082,
3478
+ "eval_samples_per_second": 174.922,
3479
+ "eval_steps_per_second": 10.933,
3480
+ "step": 868248
3481
+ },
3482
+ {
3483
+ "epoch": 99.59,
3484
+ "learning_rate": 3.008113789557866e-05,
3485
+ "loss": 0.8693,
3486
+ "step": 871749
3487
+ },
3488
+ {
3489
+ "epoch": 99.59,
3490
+ "eval_loss": 0.8701997399330139,
3491
+ "eval_runtime": 177.8864,
3492
+ "eval_samples_per_second": 174.943,
3493
+ "eval_steps_per_second": 10.934,
3494
+ "step": 871749
3495
+ },
3496
+ {
3497
+ "epoch": 99.99,
3498
+ "learning_rate": 3.0001142465440425e-05,
3499
+ "loss": 0.8692,
3500
+ "step": 875250
3501
+ },
3502
+ {
3503
+ "epoch": 99.99,
3504
+ "eval_loss": 0.8659647107124329,
3505
+ "eval_runtime": 177.9913,
3506
+ "eval_samples_per_second": 174.84,
3507
+ "eval_steps_per_second": 10.927,
3508
+ "step": 875250
3509
+ },
3510
+ {
3511
+ "epoch": 100.39,
3512
+ "learning_rate": 2.9921147035302184e-05,
3513
+ "loss": 0.8646,
3514
+ "step": 878751
3515
+ },
3516
+ {
3517
+ "epoch": 100.39,
3518
+ "eval_loss": 0.86882084608078,
3519
+ "eval_runtime": 177.9376,
3520
+ "eval_samples_per_second": 174.893,
3521
+ "eval_steps_per_second": 10.931,
3522
+ "step": 878751
3523
+ },
3524
+ {
3525
+ "epoch": 100.79,
3526
+ "learning_rate": 2.9841151605163947e-05,
3527
+ "loss": 0.8698,
3528
+ "step": 882252
3529
+ },
3530
+ {
3531
+ "epoch": 100.79,
3532
+ "eval_loss": 0.8718056082725525,
3533
+ "eval_runtime": 178.2288,
3534
+ "eval_samples_per_second": 174.607,
3535
+ "eval_steps_per_second": 10.913,
3536
+ "step": 882252
3537
+ },
3538
+ {
3539
+ "epoch": 101.19,
3540
+ "learning_rate": 2.9761156175025706e-05,
3541
+ "loss": 0.8675,
3542
+ "step": 885753
3543
+ },
3544
+ {
3545
+ "epoch": 101.19,
3546
+ "eval_loss": 0.8696756362915039,
3547
+ "eval_runtime": 178.0145,
3548
+ "eval_samples_per_second": 174.817,
3549
+ "eval_steps_per_second": 10.926,
3550
+ "step": 885753
3551
+ },
3552
+ {
3553
+ "epoch": 101.59,
3554
+ "learning_rate": 2.968116074488747e-05,
3555
+ "loss": 0.8668,
3556
+ "step": 889254
3557
+ },
3558
+ {
3559
+ "epoch": 101.59,
3560
+ "eval_loss": 0.8669666051864624,
3561
+ "eval_runtime": 177.9105,
3562
+ "eval_samples_per_second": 174.919,
3563
+ "eval_steps_per_second": 10.932,
3564
+ "step": 889254
3565
+ },
3566
+ {
3567
+ "epoch": 101.99,
3568
+ "learning_rate": 2.960116531474923e-05,
3569
+ "loss": 0.8676,
3570
+ "step": 892755
3571
+ },
3572
+ {
3573
+ "epoch": 101.99,
3574
+ "eval_loss": 0.8685517311096191,
3575
+ "eval_runtime": 178.0845,
3576
+ "eval_samples_per_second": 174.749,
3577
+ "eval_steps_per_second": 10.922,
3578
+ "step": 892755
3579
+ },
3580
+ {
3581
+ "epoch": 102.39,
3582
+ "learning_rate": 2.9521169884610994e-05,
3583
+ "loss": 0.8645,
3584
+ "step": 896256
3585
+ },
3586
+ {
3587
+ "epoch": 102.39,
3588
+ "eval_loss": 0.8676366209983826,
3589
+ "eval_runtime": 177.8814,
3590
+ "eval_samples_per_second": 174.948,
3591
+ "eval_steps_per_second": 10.934,
3592
+ "step": 896256
3593
+ },
3594
+ {
3595
+ "epoch": 102.79,
3596
+ "learning_rate": 2.9441174454472754e-05,
3597
+ "loss": 0.8652,
3598
+ "step": 899757
3599
+ },
3600
+ {
3601
+ "epoch": 102.79,
3602
+ "eval_loss": 0.8648022413253784,
3603
+ "eval_runtime": 177.9084,
3604
+ "eval_samples_per_second": 174.921,
3605
+ "eval_steps_per_second": 10.933,
3606
+ "step": 899757
3607
+ },
3608
+ {
3609
+ "epoch": 103.19,
3610
+ "learning_rate": 2.9361179024334517e-05,
3611
+ "loss": 0.8646,
3612
+ "step": 903258
3613
+ },
3614
+ {
3615
+ "epoch": 103.19,
3616
+ "eval_loss": 0.8692647814750671,
3617
+ "eval_runtime": 177.9213,
3618
+ "eval_samples_per_second": 174.909,
3619
+ "eval_steps_per_second": 10.932,
3620
+ "step": 903258
3621
+ },
3622
+ {
3623
+ "epoch": 103.59,
3624
+ "learning_rate": 2.9281183594196276e-05,
3625
+ "loss": 0.8634,
3626
+ "step": 906759
3627
+ },
3628
+ {
3629
+ "epoch": 103.59,
3630
+ "eval_loss": 0.8666937351226807,
3631
+ "eval_runtime": 178.0016,
3632
+ "eval_samples_per_second": 174.83,
3633
+ "eval_steps_per_second": 10.927,
3634
+ "step": 906759
3635
+ },
3636
+ {
3637
+ "epoch": 103.99,
3638
+ "learning_rate": 2.9201188164058042e-05,
3639
+ "loss": 0.8678,
3640
+ "step": 910260
3641
+ },
3642
+ {
3643
+ "epoch": 103.99,
3644
+ "eval_loss": 0.8698520660400391,
3645
+ "eval_runtime": 178.1156,
3646
+ "eval_samples_per_second": 174.718,
3647
+ "eval_steps_per_second": 10.92,
3648
+ "step": 910260
3649
+ },
3650
+ {
3651
+ "epoch": 104.39,
3652
+ "learning_rate": 2.91211927339198e-05,
3653
+ "loss": 0.8616,
3654
+ "step": 913761
3655
+ },
3656
+ {
3657
+ "epoch": 104.39,
3658
+ "eval_loss": 0.864092230796814,
3659
+ "eval_runtime": 177.9776,
3660
+ "eval_samples_per_second": 174.853,
3661
+ "eval_steps_per_second": 10.928,
3662
+ "step": 913761
3663
+ },
3664
+ {
3665
+ "epoch": 104.79,
3666
+ "learning_rate": 2.9041197303781564e-05,
3667
+ "loss": 0.8631,
3668
+ "step": 917262
3669
+ },
3670
+ {
3671
+ "epoch": 104.79,
3672
+ "eval_loss": 0.8643292188644409,
3673
+ "eval_runtime": 178.3388,
3674
+ "eval_samples_per_second": 174.499,
3675
+ "eval_steps_per_second": 10.906,
3676
+ "step": 917262
3677
+ },
3678
+ {
3679
+ "epoch": 105.19,
3680
+ "learning_rate": 2.8961201873643323e-05,
3681
+ "loss": 0.8643,
3682
+ "step": 920763
3683
+ },
3684
+ {
3685
+ "epoch": 105.19,
3686
+ "eval_loss": 0.867784857749939,
3687
+ "eval_runtime": 178.0618,
3688
+ "eval_samples_per_second": 174.771,
3689
+ "eval_steps_per_second": 10.923,
3690
+ "step": 920763
3691
+ },
3692
+ {
3693
+ "epoch": 105.59,
3694
+ "learning_rate": 2.8881206443505086e-05,
3695
+ "loss": 0.8616,
3696
+ "step": 924264
3697
+ },
3698
+ {
3699
+ "epoch": 105.59,
3700
+ "eval_loss": 0.8667683601379395,
3701
+ "eval_runtime": 177.9577,
3702
+ "eval_samples_per_second": 174.873,
3703
+ "eval_steps_per_second": 10.93,
3704
+ "step": 924264
3705
+ },
3706
+ {
3707
+ "epoch": 105.99,
3708
+ "learning_rate": 2.8801211013366845e-05,
3709
+ "loss": 0.8644,
3710
+ "step": 927765
3711
+ },
3712
+ {
3713
+ "epoch": 105.99,
3714
+ "eval_loss": 0.8647730946540833,
3715
+ "eval_runtime": 178.1529,
3716
+ "eval_samples_per_second": 174.681,
3717
+ "eval_steps_per_second": 10.918,
3718
+ "step": 927765
3719
+ },
3720
+ {
3721
+ "epoch": 106.39,
3722
+ "learning_rate": 2.872121558322861e-05,
3723
+ "loss": 0.859,
3724
+ "step": 931266
3725
+ },
3726
+ {
3727
+ "epoch": 106.39,
3728
+ "eval_loss": 0.8621995449066162,
3729
+ "eval_runtime": 178.0998,
3730
+ "eval_samples_per_second": 174.733,
3731
+ "eval_steps_per_second": 10.921,
3732
+ "step": 931266
3733
+ },
3734
+ {
3735
+ "epoch": 106.79,
3736
+ "learning_rate": 2.864122015309037e-05,
3737
+ "loss": 0.8611,
3738
+ "step": 934767
3739
+ },
3740
+ {
3741
+ "epoch": 106.79,
3742
+ "eval_loss": 0.8664916753768921,
3743
+ "eval_runtime": 178.2296,
3744
+ "eval_samples_per_second": 174.606,
3745
+ "eval_steps_per_second": 10.913,
3746
+ "step": 934767
3747
+ },
3748
+ {
3749
+ "epoch": 107.19,
3750
+ "learning_rate": 2.8561224722952134e-05,
3751
+ "loss": 0.8608,
3752
+ "step": 938268
3753
+ },
3754
+ {
3755
+ "epoch": 107.19,
3756
+ "eval_loss": 0.8675068020820618,
3757
+ "eval_runtime": 178.0022,
3758
+ "eval_samples_per_second": 174.829,
3759
+ "eval_steps_per_second": 10.927,
3760
+ "step": 938268
3761
+ },
3762
+ {
3763
+ "epoch": 107.59,
3764
+ "learning_rate": 2.8481229292813893e-05,
3765
+ "loss": 0.8596,
3766
+ "step": 941769
3767
+ },
3768
+ {
3769
+ "epoch": 107.59,
3770
+ "eval_loss": 0.8647910356521606,
3771
+ "eval_runtime": 177.9926,
3772
+ "eval_samples_per_second": 174.839,
3773
+ "eval_steps_per_second": 10.927,
3774
+ "step": 941769
3775
+ },
3776
+ {
3777
+ "epoch": 107.99,
3778
+ "learning_rate": 2.840123386267566e-05,
3779
+ "loss": 0.863,
3780
+ "step": 945270
3781
+ },
3782
+ {
3783
+ "epoch": 107.99,
3784
+ "eval_loss": 0.8669795393943787,
3785
+ "eval_runtime": 177.9511,
3786
+ "eval_samples_per_second": 174.879,
3787
+ "eval_steps_per_second": 10.93,
3788
+ "step": 945270
3789
+ },
3790
+ {
3791
+ "epoch": 108.39,
3792
+ "learning_rate": 2.832123843253742e-05,
3793
+ "loss": 0.8589,
3794
+ "step": 948771
3795
+ },
3796
+ {
3797
+ "epoch": 108.39,
3798
+ "eval_loss": 0.8626872301101685,
3799
+ "eval_runtime": 178.2769,
3800
+ "eval_samples_per_second": 174.56,
3801
+ "eval_steps_per_second": 10.91,
3802
+ "step": 948771
3803
+ },
3804
+ {
3805
+ "epoch": 108.79,
3806
+ "learning_rate": 2.824124300239918e-05,
3807
+ "loss": 0.8605,
3808
+ "step": 952272
3809
+ },
3810
+ {
3811
+ "epoch": 108.79,
3812
+ "eval_loss": 0.8620831370353699,
3813
+ "eval_runtime": 177.8108,
3814
+ "eval_samples_per_second": 175.018,
3815
+ "eval_steps_per_second": 10.939,
3816
+ "step": 952272
3817
+ },
3818
+ {
3819
+ "epoch": 109.19,
3820
+ "learning_rate": 2.816124757226094e-05,
3821
+ "loss": 0.8578,
3822
+ "step": 955773
3823
+ },
3824
+ {
3825
+ "epoch": 109.19,
3826
+ "eval_loss": 0.8637415170669556,
3827
+ "eval_runtime": 177.6547,
3828
+ "eval_samples_per_second": 175.171,
3829
+ "eval_steps_per_second": 10.948,
3830
+ "step": 955773
3831
+ },
3832
+ {
3833
+ "epoch": 109.59,
3834
+ "learning_rate": 2.8081252142122706e-05,
3835
+ "loss": 0.8594,
3836
+ "step": 959274
3837
+ },
3838
+ {
3839
+ "epoch": 109.59,
3840
+ "eval_loss": 0.8635110855102539,
3841
+ "eval_runtime": 177.7697,
3842
+ "eval_samples_per_second": 175.058,
3843
+ "eval_steps_per_second": 10.941,
3844
+ "step": 959274
3845
+ },
3846
+ {
3847
+ "epoch": 109.99,
3848
+ "learning_rate": 2.8001256711984462e-05,
3849
+ "loss": 0.8619,
3850
+ "step": 962775
3851
+ },
3852
+ {
3853
+ "epoch": 109.99,
3854
+ "eval_loss": 0.8643426299095154,
3855
+ "eval_runtime": 177.7171,
3856
+ "eval_samples_per_second": 175.11,
3857
+ "eval_steps_per_second": 10.944,
3858
+ "step": 962775
3859
+ }
3860
+ ],
3861
+ "max_steps": 2188250,
3862
+ "num_train_epochs": 250,
3863
+ "total_flos": 8.10959302547497e+18,
3864
+ "trial_name": null,
3865
+ "trial_params": null
3866
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7910258e830319e345a2ecf41072cb3fbe6e5359e308e9a5856c1068f6a34409
3
+ size 3055