Ogamon committed
Commit 3290319
Parent: cb26ffe

second commit
all_results.json CHANGED
@@ -1,9 +1,9 @@
  {
- "epoch": 4.951768488745981,
- "num_input_tokens_seen": 6318560,
- "total_flos": 2.5049338265940787e+17,
- "train_loss": 0.9364173337708336,
- "train_runtime": 4331.1035,
- "train_samples_per_second": 22.95,
- "train_steps_per_second": 0.089
+ "predict_bleu-4": 85.30970250000001,
+ "predict_rouge-1": 92.890625,
+ "predict_rouge-2": 0.0,
+ "predict_rouge-l": 92.890625,
+ "predict_runtime": 17.8088,
+ "predict_samples_per_second": 143.412,
+ "predict_steps_per_second": 8.984
  }
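The replaced entries record a prediction run rather than the earlier training run. As a quick, purely illustrative consistency check (the example count below is inferred from the metrics, not stated anywhere in the commit):

```python
# Sanity-check the new predict_* numbers from all_results.json (values copied from the diff above).
runtime = 17.8088          # predict_runtime, seconds
samples_per_sec = 143.412  # predict_samples_per_second
steps_per_sec = 8.984      # predict_steps_per_second

n_samples = samples_per_sec * runtime   # ~2554 examples scored
n_steps = steps_per_sec * runtime       # ~160 prediction steps
print(round(n_samples), round(n_steps), round(n_samples / n_steps))
# -> 2554 160 16, i.e. about 16 examples per step, consistent with eval.batch_size: 2
#    per device on the 8 GPUs visible later in running_log.txt.
```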
generated_predictions.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llamaboard_config.yaml CHANGED
@@ -1,5 +1,16 @@
+ eval.batch_size: 2
+ eval.cutoff_len: 1024
+ eval.dataset:
+ - truth_dev
+ eval.dataset_dir: data
+ eval.max_new_tokens: 512
+ eval.max_samples: '100000'
+ eval.output_dir: eval_2024-07-11-10-49-45
+ eval.predict: true
+ eval.temperature: 0.95
+ eval.top_p: 0.7
  top.booster: auto
- top.checkpoint_path: null
+ top.checkpoint_path: train_2024-07-11-09-30-54_llama2_inst_truth
  top.finetuning_type: full
  top.model_name: LLaMA2-7B-Chat
  top.quantization_bit: none
@@ -7,59 +18,3 @@ top.quantization_method: bitsandbytes
  top.rope_scaling: none
  top.template: llama2
  top.visual_inputs: false
- train.additional_target: ''
- train.badam_mode: layer
- train.badam_switch_interval: 50
- train.badam_switch_mode: ascending
- train.badam_update_ratio: 0.05
- train.batch_size: 4
- train.compute_type: bf16
- train.create_new_adapter: false
- train.cutoff_len: 1024
- train.dataset:
- - truth_train
- train.dataset_dir: data
- train.ds_offload: false
- train.ds_stage: '2'
- train.freeze_extra_modules: ''
- train.freeze_trainable_layers: 2
- train.freeze_trainable_modules: all
- train.galore_rank: 16
- train.galore_scale: 0.25
- train.galore_target: all
- train.galore_update_interval: 200
- train.gradient_accumulation_steps: 8
- train.learning_rate: 5e-6
- train.logging_steps: 1
- train.lora_alpha: 16
- train.lora_dropout: 0
- train.lora_rank: 8
- train.lora_target: ''
- train.loraplus_lr_ratio: 0
- train.lr_scheduler_type: cosine
- train.max_grad_norm: '1.0'
- train.max_samples: '100000'
- train.neat_packing: false
- train.neftune_alpha: 0
- train.num_train_epochs: '5.0'
- train.optim: adamw_torch
- train.packing: false
- train.ppo_score_norm: false
- train.ppo_whiten_rewards: false
- train.pref_beta: 0.1
- train.pref_ftx: 0
- train.pref_loss: sigmoid
- train.report_to: false
- train.resize_vocab: false
- train.reward_model: null
- train.save_steps: 980
- train.shift_attn: false
- train.training_stage: Supervised Fine-Tuning
- train.use_badam: false
- train.use_dora: false
- train.use_galore: false
- train.use_llama_pro: false
- train.use_pissa: false
- train.use_rslora: false
- train.val_size: 0
- train.warmup_steps: 600
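The new eval.* block drives a prediction pass over truth_dev using the freshly trained checkpoint (train_2024-07-11-09-30-54_llama2_inst_truth). A minimal sketch of how those sampling settings would look as a plain transformers GenerationConfig; this is only an illustration of the values in the YAML, not LLaMA-Factory's internal code:

```python
from transformers import GenerationConfig

# Assumed one-to-one mapping of the eval.* keys above onto generation arguments.
gen_config = GenerationConfig(
    do_sample=True,       # temperature/top_p only take effect when sampling
    temperature=0.95,     # eval.temperature
    top_p=0.7,            # eval.top_p
    max_new_tokens=512,   # eval.max_new_tokens
)
print(gen_config)
```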
predict_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "predict_bleu-4": 85.30970250000001,
+ "predict_rouge-1": 92.890625,
+ "predict_rouge-2": 0.0,
+ "predict_rouge-l": 92.890625,
+ "predict_runtime": 17.8088,
+ "predict_samples_per_second": 143.412,
+ "predict_steps_per_second": 8.984
+ }
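predict_results.json mirrors the metrics now in all_results.json; the per-example outputs live in generated_predictions.jsonl. A rough sketch of recomputing a ROUGE-L-style score from that file; the `label`/`predict` field names and the whitespace tokenization are assumptions, so the result will only approximate the 92.89 reported above:

```python
import json

def lcs_len(a, b):
    # Longest common subsequence length, the core of ROUGE-L.
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a):
        for j, y in enumerate(b):
            dp[i + 1][j + 1] = dp[i][j] + 1 if x == y else max(dp[i][j + 1], dp[i + 1][j])
    return dp[-1][-1]

scores = []
with open("generated_predictions.jsonl") as f:
    for line in f:
        row = json.loads(line)
        pred, ref = row["predict"].split(), row["label"].split()  # assumed field names
        scores.append(100.0 * lcs_len(pred, ref) / max(len(ref), 1))

print(sum(scores) / len(scores))  # recall-only ROUGE-L over whitespace tokens
```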
running_log.txt CHANGED
@@ -1,65 +1,45 @@
- 07/11/2024 09:33:21 - INFO - llamafactory.hparams.parser - Process rank: 2, device: cuda:2, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16

- 07/11/2024 09:33:21 - INFO - llamafactory.hparams.parser - Process rank: 4, device: cuda:4, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16

- 07/11/2024 09:33:21 - INFO - llamafactory.hparams.parser - Process rank: 7, device: cuda:7, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16

- 07/11/2024 09:33:21 - INFO - llamafactory.hparams.parser - Process rank: 5, device: cuda:5, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16

- 07/11/2024 09:33:21 - INFO - llamafactory.hparams.parser - Process rank: 6, device: cuda:6, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16

- 07/11/2024 09:33:21 - INFO - llamafactory.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16

- 07/11/2024 09:33:21 - INFO - llamafactory.hparams.parser - Process rank: 3, device: cuda:3, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16

- [INFO|parser.py:325] 2024-07-11 09:33:21,701 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16

- [INFO|tokenization_utils_base.py:2161] 2024-07-11 09:33:24,201 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590/tokenizer.model

- [INFO|tokenization_utils_base.py:2161] 2024-07-11 09:33:24,201 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590/tokenizer.json

- [INFO|tokenization_utils_base.py:2161] 2024-07-11 09:33:24,201 >> loading file added_tokens.json from cache at None

- [INFO|tokenization_utils_base.py:2161] 2024-07-11 09:33:24,201 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590/special_tokens_map.json

- [INFO|tokenization_utils_base.py:2161] 2024-07-11 09:33:24,202 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590/tokenizer_config.json

- 07/11/2024 09:33:24 - INFO - llamafactory.data.template - Add pad token: </s>

- [INFO|template.py:372] 2024-07-11 09:33:24,309 >> Add pad token: </s>

- [INFO|loader.py:50] 2024-07-11 09:33:24,310 >> Loading dataset train_output.json...

- 07/11/2024 09:33:24 - INFO - llamafactory.data.template - Add pad token: </s>

- 07/11/2024 09:33:24 - INFO - llamafactory.data.template - Add pad token: </s>

- 07/11/2024 09:33:24 - INFO - llamafactory.data.template - Add pad token: </s>

- 07/11/2024 09:33:24 - INFO - llamafactory.data.template - Add pad token: </s>

- 07/11/2024 09:33:24 - INFO - llamafactory.data.template - Add pad token: </s>
-
- 07/11/2024 09:33:24 - INFO - llamafactory.data.template - Add pad token: </s>
-
- 07/11/2024 09:33:25 - INFO - llamafactory.data.loader - Loading dataset train_output.json...
-
- 07/11/2024 09:33:25 - INFO - llamafactory.data.loader - Loading dataset train_output.json...
-
- 07/11/2024 09:33:25 - INFO - llamafactory.data.loader - Loading dataset train_output.json...
-
- 07/11/2024 09:33:25 - INFO - llamafactory.data.loader - Loading dataset train_output.json...
-
- 07/11/2024 09:33:25 - INFO - llamafactory.data.loader - Loading dataset train_output.json...
-
- 07/11/2024 09:33:25 - INFO - llamafactory.data.loader - Loading dataset train_output.json...
-
- 07/11/2024 09:33:25 - INFO - llamafactory.data.loader - Loading dataset train_output.json...
-
- [INFO|configuration_utils.py:733] 2024-07-11 09:33:27,545 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590/config.json
-
- [INFO|configuration_utils.py:800] 2024-07-11 09:33:27,546 >> Model config LlamaConfig {
- "_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
  "architectures": [
  "LlamaForCausalLM"
  ],
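This first hunk removes the setup portion of the old training log: eight ranks come up with bfloat16 compute, the Llama-2 chat tokenizer is read from the Hub cache, `</s>` is registered as the pad token, and train_output.json is loaded on every rank. A minimal stand-alone equivalent of the tokenizer step, assuming access to the gated meta-llama repo:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
# Llama-2 ships without a pad token; as in the log, reuse the EOS token "</s>" for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.pad_token)  # </s>
```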
@@ -82,944 +62,112 @@
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
- "torch_dtype": "float16",
  "transformers_version": "4.42.3",
- "use_cache": true,
  "vocab_size": 32000
  }


- [INFO|modeling_utils.py:3556] 2024-07-11 09:33:28,355 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590/model.safetensors.index.json
-
- [INFO|modeling_utils.py:1531] 2024-07-11 09:35:30,794 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
-
- [INFO|configuration_utils.py:1000] 2024-07-11 09:35:30,799 >> Generate config GenerationConfig {
- "bos_token_id": 1,
- "eos_token_id": 2
- }
-
-
- [INFO|modeling_utils.py:4364] 2024-07-11 09:35:47,952 >> All model checkpoint weights were used when initializing LlamaForCausalLM.


- [INFO|modeling_utils.py:4372] 2024-07-11 09:35:47,952 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf.
- If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.

- [INFO|configuration_utils.py:955] 2024-07-11 09:35:48,301 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590/generation_config.json
-
- [INFO|configuration_utils.py:1000] 2024-07-11 09:35:48,301 >> Generate config GenerationConfig {
  "bos_token_id": 1,
- "do_sample": true,
- "eos_token_id": 2,
- "max_length": 4096,
- "pad_token_id": 0,
- "temperature": 0.6,
- "top_p": 0.9
  }

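Per the log lines above, the checkpoint is stored in float16 but instantiated under torch.bfloat16 (matching `train.compute_type: bf16`), and its default generation_config (temperature 0.6, top_p 0.9) is what the later eval settings override. A hedged sketch of an equivalent load with plain transformers, not the tool's own loader:

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    torch_dtype=torch.bfloat16,  # matches "Instantiating ... under default dtype torch.bfloat16"
)
print(sum(p.numel() for p in model.parameters()))  # ~6,738,415,616, as loader.py reports below
```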
- [INFO|checkpointing.py:103] 2024-07-11 09:35:48,308 >> Gradient checkpointing enabled.
-
- [INFO|attention.py:80] 2024-07-11 09:35:48,308 >> Using torch SDPA for faster training and inference.
-
- [INFO|adapter.py:302] 2024-07-11 09:35:48,309 >> Upcasting trainable params to float32.
-
- [INFO|adapter.py:48] 2024-07-11 09:35:48,309 >> Fine-tuning method: Full
-
- [INFO|loader.py:196] 2024-07-11 09:35:48,360 >> trainable params: 6,738,415,616 || all params: 6,738,415,616 || trainable%: 100.0000
-
- [INFO|trainer.py:642] 2024-07-11 09:35:48,366 >> Using auto half precision backend
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.loader - trainable params: 6,738,415,616 || all params: 6,738,415,616 || trainable%: 100.0000
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.loader - trainable params: 6,738,415,616 || all params: 6,738,415,616 || trainable%: 100.0000
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.loader - trainable params: 6,738,415,616 || all params: 6,738,415,616 || trainable%: 100.0000
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.loader - trainable params: 6,738,415,616 || all params: 6,738,415,616 || trainable%: 100.0000
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.loader - trainable params: 6,738,415,616 || all params: 6,738,415,616 || trainable%: 100.0000
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.adapter - Fine-tuning method: Full
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.loader - trainable params: 6,738,415,616 || all params: 6,738,415,616 || trainable%: 100.0000
-
- 07/11/2024 09:35:49 - INFO - llamafactory.model.loader - trainable params: 6,738,415,616 || all params: 6,738,415,616 || trainable%: 100.0000
-
- [INFO|trainer.py:2128] 2024-07-11 09:36:08,514 >> ***** Running training *****
-
- [INFO|trainer.py:2129] 2024-07-11 09:36:08,514 >> Num examples = 19,880
-
- [INFO|trainer.py:2130] 2024-07-11 09:36:08,514 >> Num Epochs = 5
-
- [INFO|trainer.py:2131] 2024-07-11 09:36:08,514 >> Instantaneous batch size per device = 4
-
- [INFO|trainer.py:2134] 2024-07-11 09:36:08,514 >> Total train batch size (w. parallel, distributed & accumulation) = 256
-
- [INFO|trainer.py:2135] 2024-07-11 09:36:08,514 >> Gradient Accumulation steps = 8
-
- [INFO|trainer.py:2136] 2024-07-11 09:36:08,514 >> Total optimization steps = 385
-
- [INFO|trainer.py:2137] 2024-07-11 09:36:08,516 >> Number of trainable parameters = 6,738,415,616
-
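The trainer banner above fixes the run geometry. A quick arithmetic check that the logged numbers agree with the removed train.* config (per-device batch 4, gradient accumulation 8, 8 ranks, 5 epochs, warmup 600 steps, learning rate 5e-6):

```python
num_examples = 19_880
per_device_batch = 4
grad_accum = 8
num_gpus = 8           # ranks 0-7 in the parser lines above
epochs = 5

global_batch = per_device_batch * grad_accum * num_gpus   # 256, as logged
steps_per_epoch = num_examples // global_batch            # 77
total_steps = steps_per_epoch * epochs                    # 385, matching "Total optimization steps"

warmup_steps = 600
base_lr = 5e-6
print(global_batch, total_steps, base_lr / warmup_steps)
# -> 256 385 8.33...e-09; the last value matches the learning rate logged at step 1 below.
```

Since total_steps (385) is smaller than warmup_steps (600), the cosine scheduler never leaves its linear warm-up, which is why the learning rate in the per-step log entries keeps rising for the whole run.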
219
- [INFO|callbacks.py:310] 2024-07-11 09:36:20,236 >> {'loss': 8.1083, 'learning_rate': 8.3333e-09, 'epoch': 0.01, 'throughput': 1411.72}
220
-
221
- [INFO|callbacks.py:310] 2024-07-11 09:36:31,361 >> {'loss': 8.1052, 'learning_rate': 1.6667e-08, 'epoch': 0.03, 'throughput': 1455.41}
222
-
223
- [INFO|callbacks.py:310] 2024-07-11 09:36:42,439 >> {'loss': 8.1366, 'learning_rate': 2.5000e-08, 'epoch': 0.04, 'throughput': 1466.88}
224
-
225
- [INFO|callbacks.py:310] 2024-07-11 09:36:53,528 >> {'loss': 8.0413, 'learning_rate': 3.3333e-08, 'epoch': 0.05, 'throughput': 1462.41}
226
-
227
- [INFO|callbacks.py:310] 2024-07-11 09:37:04,628 >> {'loss': 8.1055, 'learning_rate': 4.1667e-08, 'epoch': 0.06, 'throughput': 1471.37}
228
-
229
- [INFO|callbacks.py:310] 2024-07-11 09:37:15,764 >> {'loss': 7.9666, 'learning_rate': 5.0000e-08, 'epoch': 0.08, 'throughput': 1484.19}
230
-
231
- [INFO|callbacks.py:310] 2024-07-11 09:37:26,881 >> {'loss': 8.0182, 'learning_rate': 5.8333e-08, 'epoch': 0.09, 'throughput': 1477.83}
232
-
233
- [INFO|callbacks.py:310] 2024-07-11 09:37:38,008 >> {'loss': 8.0363, 'learning_rate': 6.6667e-08, 'epoch': 0.10, 'throughput': 1468.21}
234
-
235
- [INFO|callbacks.py:310] 2024-07-11 09:37:49,153 >> {'loss': 8.0918, 'learning_rate': 7.5000e-08, 'epoch': 0.12, 'throughput': 1469.05}
236
-
237
- [INFO|callbacks.py:310] 2024-07-11 09:38:00,281 >> {'loss': 8.0197, 'learning_rate': 8.3333e-08, 'epoch': 0.13, 'throughput': 1467.95}
238
-
239
- [INFO|callbacks.py:310] 2024-07-11 09:38:11,405 >> {'loss': 7.9714, 'learning_rate': 9.1667e-08, 'epoch': 0.14, 'throughput': 1470.48}
240
-
241
- [INFO|callbacks.py:310] 2024-07-11 09:38:22,461 >> {'loss': 8.0131, 'learning_rate': 1.0000e-07, 'epoch': 0.15, 'throughput': 1463.77}
242
-
243
- [INFO|callbacks.py:310] 2024-07-11 09:38:33,540 >> {'loss': 7.9796, 'learning_rate': 1.0833e-07, 'epoch': 0.17, 'throughput': 1457.86}
244
-
245
- [INFO|callbacks.py:310] 2024-07-11 09:38:44,657 >> {'loss': 7.9999, 'learning_rate': 1.1667e-07, 'epoch': 0.18, 'throughput': 1460.84}
246
-
247
- [INFO|callbacks.py:310] 2024-07-11 09:38:55,796 >> {'loss': 8.0168, 'learning_rate': 1.2500e-07, 'epoch': 0.19, 'throughput': 1461.12}
248
-
249
- [INFO|callbacks.py:310] 2024-07-11 09:39:06,924 >> {'loss': 7.8423, 'learning_rate': 1.3333e-07, 'epoch': 0.21, 'throughput': 1463.26}
250
-
251
- [INFO|callbacks.py:310] 2024-07-11 09:39:18,061 >> {'loss': 7.9208, 'learning_rate': 1.4167e-07, 'epoch': 0.22, 'throughput': 1461.36}
252
-
253
- [INFO|callbacks.py:310] 2024-07-11 09:39:29,196 >> {'loss': 7.6898, 'learning_rate': 1.5000e-07, 'epoch': 0.23, 'throughput': 1462.71}
254
-
255
- [INFO|callbacks.py:310] 2024-07-11 09:39:40,310 >> {'loss': 7.9285, 'learning_rate': 1.5833e-07, 'epoch': 0.24, 'throughput': 1459.08}
256
-
257
- [INFO|callbacks.py:310] 2024-07-11 09:39:51,418 >> {'loss': 7.8924, 'learning_rate': 1.6667e-07, 'epoch': 0.26, 'throughput': 1458.15}
258
-
259
- [INFO|callbacks.py:310] 2024-07-11 09:40:02,525 >> {'loss': 7.6173, 'learning_rate': 1.7500e-07, 'epoch': 0.27, 'throughput': 1460.88}
260
-
261
- [INFO|callbacks.py:310] 2024-07-11 09:40:13,636 >> {'loss': 7.7661, 'learning_rate': 1.8333e-07, 'epoch': 0.28, 'throughput': 1463.97}
262
-
263
- [INFO|callbacks.py:310] 2024-07-11 09:40:24,739 >> {'loss': 7.2936, 'learning_rate': 1.9167e-07, 'epoch': 0.30, 'throughput': 1464.98}
264
-
265
- [INFO|callbacks.py:310] 2024-07-11 09:40:35,863 >> {'loss': 7.3044, 'learning_rate': 2.0000e-07, 'epoch': 0.31, 'throughput': 1468.54}
266
-
267
- [INFO|callbacks.py:310] 2024-07-11 09:40:46,970 >> {'loss': 7.3500, 'learning_rate': 2.0833e-07, 'epoch': 0.32, 'throughput': 1468.11}
268
-
269
- [INFO|callbacks.py:310] 2024-07-11 09:40:58,094 >> {'loss': 7.1213, 'learning_rate': 2.1667e-07, 'epoch': 0.33, 'throughput': 1469.84}
270
-
271
- [INFO|callbacks.py:310] 2024-07-11 09:41:09,205 >> {'loss': 7.1363, 'learning_rate': 2.2500e-07, 'epoch': 0.35, 'throughput': 1469.80}
272
-
273
- [INFO|callbacks.py:310] 2024-07-11 09:41:20,376 >> {'loss': 6.9016, 'learning_rate': 2.3333e-07, 'epoch': 0.36, 'throughput': 1468.87}
274
-
275
- [INFO|callbacks.py:310] 2024-07-11 09:41:31,471 >> {'loss': 6.9764, 'learning_rate': 2.4167e-07, 'epoch': 0.37, 'throughput': 1467.16}
276
-
277
- [INFO|callbacks.py:310] 2024-07-11 09:41:42,574 >> {'loss': 6.9005, 'learning_rate': 2.5000e-07, 'epoch': 0.39, 'throughput': 1467.82}
278
-
279
- [INFO|callbacks.py:310] 2024-07-11 09:41:53,697 >> {'loss': 6.9852, 'learning_rate': 2.5833e-07, 'epoch': 0.40, 'throughput': 1468.73}
280
-
281
- [INFO|callbacks.py:310] 2024-07-11 09:42:04,833 >> {'loss': 5.8340, 'learning_rate': 2.6667e-07, 'epoch': 0.41, 'throughput': 1469.26}
282
-
283
- [INFO|callbacks.py:310] 2024-07-11 09:42:15,965 >> {'loss': 5.5452, 'learning_rate': 2.7500e-07, 'epoch': 0.42, 'throughput': 1471.43}
284
-
285
- [INFO|callbacks.py:310] 2024-07-11 09:42:27,099 >> {'loss': 5.5189, 'learning_rate': 2.8333e-07, 'epoch': 0.44, 'throughput': 1473.12}
286
-
287
- [INFO|callbacks.py:310] 2024-07-11 09:42:38,222 >> {'loss': 5.4654, 'learning_rate': 2.9167e-07, 'epoch': 0.45, 'throughput': 1473.94}
288
-
289
- [INFO|callbacks.py:310] 2024-07-11 09:42:49,345 >> {'loss': 5.3117, 'learning_rate': 3.0000e-07, 'epoch': 0.46, 'throughput': 1472.23}
290
-
291
- [INFO|callbacks.py:310] 2024-07-11 09:43:00,464 >> {'loss': 5.2744, 'learning_rate': 3.0833e-07, 'epoch': 0.48, 'throughput': 1470.87}
292
-
293
- [INFO|callbacks.py:310] 2024-07-11 09:43:11,566 >> {'loss': 5.2821, 'learning_rate': 3.1667e-07, 'epoch': 0.49, 'throughput': 1470.77}
294
-
295
- [INFO|callbacks.py:310] 2024-07-11 09:43:22,656 >> {'loss': 5.0967, 'learning_rate': 3.2500e-07, 'epoch': 0.50, 'throughput': 1470.64}
296
-
297
- [INFO|callbacks.py:310] 2024-07-11 09:43:33,736 >> {'loss': 5.2167, 'learning_rate': 3.3333e-07, 'epoch': 0.51, 'throughput': 1470.41}
298
-
299
- [INFO|callbacks.py:310] 2024-07-11 09:43:44,884 >> {'loss': 5.1517, 'learning_rate': 3.4167e-07, 'epoch': 0.53, 'throughput': 1471.24}
300
-
301
- [INFO|callbacks.py:310] 2024-07-11 09:43:56,018 >> {'loss': 4.6959, 'learning_rate': 3.5000e-07, 'epoch': 0.54, 'throughput': 1471.59}
302
-
303
- [INFO|callbacks.py:310] 2024-07-11 09:44:07,144 >> {'loss': 4.1458, 'learning_rate': 3.5833e-07, 'epoch': 0.55, 'throughput': 1471.34}
304
-
305
- [INFO|callbacks.py:310] 2024-07-11 09:44:18,276 >> {'loss': 3.6692, 'learning_rate': 3.6667e-07, 'epoch': 0.57, 'throughput': 1471.55}
306
-
307
- [INFO|callbacks.py:310] 2024-07-11 09:44:29,413 >> {'loss': 3.3322, 'learning_rate': 3.7500e-07, 'epoch': 0.58, 'throughput': 1472.18}
308
-
309
- [INFO|callbacks.py:310] 2024-07-11 09:44:40,545 >> {'loss': 3.2857, 'learning_rate': 3.8333e-07, 'epoch': 0.59, 'throughput': 1472.04}
310
-
311
- [INFO|callbacks.py:310] 2024-07-11 09:44:51,649 >> {'loss': 3.0806, 'learning_rate': 3.9167e-07, 'epoch': 0.60, 'throughput': 1472.06}
312
-
313
- [INFO|callbacks.py:310] 2024-07-11 09:45:02,778 >> {'loss': 2.8239, 'learning_rate': 4.0000e-07, 'epoch': 0.62, 'throughput': 1472.90}
314
-
315
- [INFO|callbacks.py:310] 2024-07-11 09:45:13,907 >> {'loss': 2.8695, 'learning_rate': 4.0833e-07, 'epoch': 0.63, 'throughput': 1474.53}
316
-
317
- [INFO|callbacks.py:310] 2024-07-11 09:45:25,026 >> {'loss': 2.6218, 'learning_rate': 4.1667e-07, 'epoch': 0.64, 'throughput': 1475.14}
318
-
319
- [INFO|callbacks.py:310] 2024-07-11 09:45:36,130 >> {'loss': 2.3993, 'learning_rate': 4.2500e-07, 'epoch': 0.66, 'throughput': 1474.86}
320
-
321
- [INFO|callbacks.py:310] 2024-07-11 09:45:47,281 >> {'loss': 2.2828, 'learning_rate': 4.3333e-07, 'epoch': 0.67, 'throughput': 1474.81}
322
-
323
- [INFO|callbacks.py:310] 2024-07-11 09:45:58,412 >> {'loss': 2.2063, 'learning_rate': 4.4167e-07, 'epoch': 0.68, 'throughput': 1475.84}
324
-
325
- [INFO|callbacks.py:310] 2024-07-11 09:46:09,535 >> {'loss': 1.8878, 'learning_rate': 4.5000e-07, 'epoch': 0.69, 'throughput': 1474.30}
326
-
327
- [INFO|callbacks.py:310] 2024-07-11 09:46:20,652 >> {'loss': 1.5971, 'learning_rate': 4.5833e-07, 'epoch': 0.71, 'throughput': 1475.65}
328
-
329
- [INFO|callbacks.py:310] 2024-07-11 09:46:31,747 >> {'loss': 0.9604, 'learning_rate': 4.6667e-07, 'epoch': 0.72, 'throughput': 1475.05}
330
-
331
- [INFO|callbacks.py:310] 2024-07-11 09:46:42,858 >> {'loss': 0.6287, 'learning_rate': 4.7500e-07, 'epoch': 0.73, 'throughput': 1475.70}
332
-
333
- [INFO|callbacks.py:310] 2024-07-11 09:46:53,999 >> {'loss': 0.4844, 'learning_rate': 4.8333e-07, 'epoch': 0.75, 'throughput': 1476.45}
334
-
335
- [INFO|callbacks.py:310] 2024-07-11 09:47:05,114 >> {'loss': 0.3891, 'learning_rate': 4.9167e-07, 'epoch': 0.76, 'throughput': 1475.83}
336
-
337
- [INFO|callbacks.py:310] 2024-07-11 09:47:16,243 >> {'loss': 0.3570, 'learning_rate': 5.0000e-07, 'epoch': 0.77, 'throughput': 1476.05}
338
-
339
- [INFO|callbacks.py:310] 2024-07-11 09:47:27,365 >> {'loss': 0.2854, 'learning_rate': 5.0833e-07, 'epoch': 0.78, 'throughput': 1476.53}
340
-
341
- [INFO|callbacks.py:310] 2024-07-11 09:47:38,471 >> {'loss': 0.2403, 'learning_rate': 5.1667e-07, 'epoch': 0.80, 'throughput': 1476.18}
342
-
343
- [INFO|callbacks.py:310] 2024-07-11 09:47:49,613 >> {'loss': 0.2522, 'learning_rate': 5.2500e-07, 'epoch': 0.81, 'throughput': 1477.05}
344
-
345
- [INFO|callbacks.py:310] 2024-07-11 09:48:00,728 >> {'loss': 0.2339, 'learning_rate': 5.3333e-07, 'epoch': 0.82, 'throughput': 1476.51}
346
-
347
- [INFO|callbacks.py:310] 2024-07-11 09:48:11,827 >> {'loss': 0.2214, 'learning_rate': 5.4167e-07, 'epoch': 0.84, 'throughput': 1477.43}
348
-
349
- [INFO|callbacks.py:310] 2024-07-11 09:48:22,937 >> {'loss': 0.1912, 'learning_rate': 5.5000e-07, 'epoch': 0.85, 'throughput': 1478.13}
350
-
351
- [INFO|callbacks.py:310] 2024-07-11 09:48:34,066 >> {'loss': 0.2157, 'learning_rate': 5.5833e-07, 'epoch': 0.86, 'throughput': 1479.16}
352
-
353
- [INFO|callbacks.py:310] 2024-07-11 09:48:45,169 >> {'loss': 0.2045, 'learning_rate': 5.6667e-07, 'epoch': 0.87, 'throughput': 1478.13}
354
-
355
- [INFO|callbacks.py:310] 2024-07-11 09:48:56,325 >> {'loss': 0.1964, 'learning_rate': 5.7500e-07, 'epoch': 0.89, 'throughput': 1478.54}
356
-
357
- [INFO|callbacks.py:310] 2024-07-11 09:49:07,421 >> {'loss': 0.1901, 'learning_rate': 5.8333e-07, 'epoch': 0.90, 'throughput': 1477.07}
358
-
359
- [INFO|callbacks.py:310] 2024-07-11 09:49:18,532 >> {'loss': 0.1891, 'learning_rate': 5.9167e-07, 'epoch': 0.91, 'throughput': 1477.00}
360
-
361
- [INFO|callbacks.py:310] 2024-07-11 09:49:29,697 >> {'loss': 0.1841, 'learning_rate': 6.0000e-07, 'epoch': 0.93, 'throughput': 1477.26}
362
-
363
- [INFO|callbacks.py:310] 2024-07-11 09:49:40,800 >> {'loss': 0.1803, 'learning_rate': 6.0833e-07, 'epoch': 0.94, 'throughput': 1477.28}
364
-
365
- [INFO|callbacks.py:310] 2024-07-11 09:49:51,887 >> {'loss': 0.1657, 'learning_rate': 6.1667e-07, 'epoch': 0.95, 'throughput': 1477.17}
366
-
367
- [INFO|callbacks.py:310] 2024-07-11 09:50:02,989 >> {'loss': 0.2157, 'learning_rate': 6.2500e-07, 'epoch': 0.96, 'throughput': 1477.26}
368
-
369
- [INFO|callbacks.py:310] 2024-07-11 09:50:14,088 >> {'loss': 0.2000, 'learning_rate': 6.3333e-07, 'epoch': 0.98, 'throughput': 1477.40}
370
-
371
- [INFO|callbacks.py:310] 2024-07-11 09:50:25,168 >> {'loss': 0.1519, 'learning_rate': 6.4167e-07, 'epoch': 0.99, 'throughput': 1476.86}
372
-
373
- [INFO|callbacks.py:310] 2024-07-11 09:50:36,302 >> {'loss': 0.1765, 'learning_rate': 6.5000e-07, 'epoch': 1.00, 'throughput': 1477.71}
374
-
375
- [INFO|callbacks.py:310] 2024-07-11 09:50:47,461 >> {'loss': 0.1907, 'learning_rate': 6.5833e-07, 'epoch': 1.02, 'throughput': 1478.10}
376
-
377
- [INFO|callbacks.py:310] 2024-07-11 09:50:58,581 >> {'loss': 0.1593, 'learning_rate': 6.6667e-07, 'epoch': 1.03, 'throughput': 1477.21}
378
-
379
- [INFO|callbacks.py:310] 2024-07-11 09:51:09,696 >> {'loss': 0.1659, 'learning_rate': 6.7500e-07, 'epoch': 1.04, 'throughput': 1477.14}
380
-
381
- [INFO|callbacks.py:310] 2024-07-11 09:51:20,799 >> {'loss': 0.1619, 'learning_rate': 6.8333e-07, 'epoch': 1.05, 'throughput': 1477.58}
382
-
383
- [INFO|callbacks.py:310] 2024-07-11 09:51:31,873 >> {'loss': 0.1680, 'learning_rate': 6.9167e-07, 'epoch': 1.07, 'throughput': 1477.08}
384
-
385
- [INFO|callbacks.py:310] 2024-07-11 09:51:43,028 >> {'loss': 0.1627, 'learning_rate': 7.0000e-07, 'epoch': 1.08, 'throughput': 1477.67}
386
-
387
- [INFO|callbacks.py:310] 2024-07-11 09:51:54,163 >> {'loss': 0.1421, 'learning_rate': 7.0833e-07, 'epoch': 1.09, 'throughput': 1478.13}
388
-
389
- [INFO|callbacks.py:310] 2024-07-11 09:52:05,268 >> {'loss': 0.1517, 'learning_rate': 7.1667e-07, 'epoch': 1.11, 'throughput': 1478.20}
390
-
391
- [INFO|callbacks.py:310] 2024-07-11 09:52:16,378 >> {'loss': 0.1519, 'learning_rate': 7.2500e-07, 'epoch': 1.12, 'throughput': 1477.17}
392
-
393
- [INFO|callbacks.py:310] 2024-07-11 09:52:27,517 >> {'loss': 0.1280, 'learning_rate': 7.3333e-07, 'epoch': 1.13, 'throughput': 1477.75}
394
-
395
- [INFO|callbacks.py:310] 2024-07-11 09:52:38,648 >> {'loss': 0.1481, 'learning_rate': 7.4167e-07, 'epoch': 1.14, 'throughput': 1477.59}
396
-
397
- [INFO|callbacks.py:310] 2024-07-11 09:52:49,791 >> {'loss': 0.1636, 'learning_rate': 7.5000e-07, 'epoch': 1.16, 'throughput': 1478.21}
398
-
399
- [INFO|callbacks.py:310] 2024-07-11 09:53:00,912 >> {'loss': 0.1443, 'learning_rate': 7.5833e-07, 'epoch': 1.17, 'throughput': 1478.70}
400
-
401
- [INFO|callbacks.py:310] 2024-07-11 09:53:12,019 >> {'loss': 0.1592, 'learning_rate': 7.6667e-07, 'epoch': 1.18, 'throughput': 1479.03}
402
-
403
- [INFO|callbacks.py:310] 2024-07-11 09:53:23,069 >> {'loss': 0.1744, 'learning_rate': 7.7500e-07, 'epoch': 1.20, 'throughput': 1478.61}
404
-
405
- [INFO|callbacks.py:310] 2024-07-11 09:53:34,191 >> {'loss': 0.1615, 'learning_rate': 7.8333e-07, 'epoch': 1.21, 'throughput': 1478.86}
406
-
407
- [INFO|callbacks.py:310] 2024-07-11 09:53:45,333 >> {'loss': 0.1466, 'learning_rate': 7.9167e-07, 'epoch': 1.22, 'throughput': 1480.34}
408
-
409
- [INFO|callbacks.py:310] 2024-07-11 09:53:56,455 >> {'loss': 0.1333, 'learning_rate': 8.0000e-07, 'epoch': 1.23, 'throughput': 1480.18}
410
-
411
- [INFO|callbacks.py:310] 2024-07-11 09:54:07,585 >> {'loss': 0.1475, 'learning_rate': 8.0833e-07, 'epoch': 1.25, 'throughput': 1479.91}
412
-
413
- [INFO|callbacks.py:310] 2024-07-11 09:54:18,772 >> {'loss': 0.1420, 'learning_rate': 8.1667e-07, 'epoch': 1.26, 'throughput': 1480.14}
414
-
415
- [INFO|callbacks.py:310] 2024-07-11 09:54:29,887 >> {'loss': 0.1480, 'learning_rate': 8.2500e-07, 'epoch': 1.27, 'throughput': 1480.08}
416
-
417
- [INFO|callbacks.py:310] 2024-07-11 09:54:40,939 >> {'loss': 0.1539, 'learning_rate': 8.3333e-07, 'epoch': 1.29, 'throughput': 1478.98}
418
-
419
- [INFO|callbacks.py:310] 2024-07-11 09:54:52,040 >> {'loss': 0.1496, 'learning_rate': 8.4167e-07, 'epoch': 1.30, 'throughput': 1479.63}
420
-
421
- [INFO|callbacks.py:310] 2024-07-11 09:55:03,133 >> {'loss': 0.1563, 'learning_rate': 8.5000e-07, 'epoch': 1.31, 'throughput': 1479.38}
422
-
423
- [INFO|callbacks.py:310] 2024-07-11 09:55:14,217 >> {'loss': 0.1295, 'learning_rate': 8.5833e-07, 'epoch': 1.32, 'throughput': 1478.95}
424
-
425
- [INFO|callbacks.py:310] 2024-07-11 09:55:25,352 >> {'loss': 0.1353, 'learning_rate': 8.6667e-07, 'epoch': 1.34, 'throughput': 1478.71}
426
-
427
- [INFO|callbacks.py:310] 2024-07-11 09:55:36,494 >> {'loss': 0.1376, 'learning_rate': 8.7500e-07, 'epoch': 1.35, 'throughput': 1478.25}
428
-
429
- [INFO|callbacks.py:310] 2024-07-11 09:55:47,650 >> {'loss': 0.1313, 'learning_rate': 8.8333e-07, 'epoch': 1.36, 'throughput': 1478.13}
430
-
431
- [INFO|callbacks.py:310] 2024-07-11 09:55:58,764 >> {'loss': 0.1367, 'learning_rate': 8.9167e-07, 'epoch': 1.38, 'throughput': 1477.58}
432
-
433
- [INFO|callbacks.py:310] 2024-07-11 09:56:09,911 >> {'loss': 0.1350, 'learning_rate': 9.0000e-07, 'epoch': 1.39, 'throughput': 1477.67}
434
-
435
- [INFO|callbacks.py:310] 2024-07-11 09:56:20,982 >> {'loss': 0.1212, 'learning_rate': 9.0833e-07, 'epoch': 1.40, 'throughput': 1477.90}
436
-
437
- [INFO|callbacks.py:310] 2024-07-11 09:56:32,078 >> {'loss': 0.1355, 'learning_rate': 9.1667e-07, 'epoch': 1.41, 'throughput': 1478.26}
438
-
439
- [INFO|callbacks.py:310] 2024-07-11 09:56:43,192 >> {'loss': 0.1439, 'learning_rate': 9.2500e-07, 'epoch': 1.43, 'throughput': 1478.53}
440
-
441
- [INFO|callbacks.py:310] 2024-07-11 09:56:54,346 >> {'loss': 0.1323, 'learning_rate': 9.3333e-07, 'epoch': 1.44, 'throughput': 1478.60}
442
-
443
- [INFO|callbacks.py:310] 2024-07-11 09:57:05,472 >> {'loss': 0.1376, 'learning_rate': 9.4167e-07, 'epoch': 1.45, 'throughput': 1478.52}
444
-
445
- [INFO|callbacks.py:310] 2024-07-11 09:57:16,583 >> {'loss': 0.1191, 'learning_rate': 9.5000e-07, 'epoch': 1.47, 'throughput': 1478.08}
446
-
447
- [INFO|callbacks.py:310] 2024-07-11 09:57:27,706 >> {'loss': 0.1045, 'learning_rate': 9.5833e-07, 'epoch': 1.48, 'throughput': 1477.61}
448
-
449
- [INFO|callbacks.py:310] 2024-07-11 09:57:38,811 >> {'loss': 0.1467, 'learning_rate': 9.6667e-07, 'epoch': 1.49, 'throughput': 1476.92}
450
-
451
- [INFO|callbacks.py:310] 2024-07-11 09:57:49,937 >> {'loss': 0.1142, 'learning_rate': 9.7500e-07, 'epoch': 1.50, 'throughput': 1477.03}
452
-
453
- [INFO|callbacks.py:310] 2024-07-11 09:58:01,037 >> {'loss': 0.1107, 'learning_rate': 9.8333e-07, 'epoch': 1.52, 'throughput': 1476.73}
454
-
455
- [INFO|callbacks.py:310] 2024-07-11 09:58:12,150 >> {'loss': 0.1460, 'learning_rate': 9.9167e-07, 'epoch': 1.53, 'throughput': 1477.00}
456
-
457
- [INFO|callbacks.py:310] 2024-07-11 09:58:23,267 >> {'loss': 0.1533, 'learning_rate': 1.0000e-06, 'epoch': 1.54, 'throughput': 1477.02}
458
-
459
- [INFO|callbacks.py:310] 2024-07-11 09:58:34,358 >> {'loss': 0.1315, 'learning_rate': 1.0083e-06, 'epoch': 1.56, 'throughput': 1476.81}
460
-
461
- [INFO|callbacks.py:310] 2024-07-11 09:58:45,523 >> {'loss': 0.1197, 'learning_rate': 1.0167e-06, 'epoch': 1.57, 'throughput': 1476.92}
462
-
463
- [INFO|callbacks.py:310] 2024-07-11 09:58:56,666 >> {'loss': 0.1228, 'learning_rate': 1.0250e-06, 'epoch': 1.58, 'throughput': 1477.06}
464
-
465
- [INFO|callbacks.py:310] 2024-07-11 09:59:07,814 >> {'loss': 0.1273, 'learning_rate': 1.0333e-06, 'epoch': 1.59, 'throughput': 1477.06}
466
-
467
- [INFO|callbacks.py:310] 2024-07-11 09:59:18,959 >> {'loss': 0.1410, 'learning_rate': 1.0417e-06, 'epoch': 1.61, 'throughput': 1477.45}
468
-
469
- [INFO|callbacks.py:310] 2024-07-11 09:59:30,083 >> {'loss': 0.1315, 'learning_rate': 1.0500e-06, 'epoch': 1.62, 'throughput': 1477.73}
470
-
471
- [INFO|callbacks.py:310] 2024-07-11 09:59:41,190 >> {'loss': 0.1136, 'learning_rate': 1.0583e-06, 'epoch': 1.63, 'throughput': 1478.09}
472
-
473
- [INFO|callbacks.py:310] 2024-07-11 09:59:52,299 >> {'loss': 0.1013, 'learning_rate': 1.0667e-06, 'epoch': 1.65, 'throughput': 1478.54}
474
-
475
- [INFO|callbacks.py:310] 2024-07-11 10:00:03,422 >> {'loss': 0.1056, 'learning_rate': 1.0750e-06, 'epoch': 1.66, 'throughput': 1478.14}
476
-
477
- [INFO|callbacks.py:310] 2024-07-11 10:00:14,531 >> {'loss': 0.1071, 'learning_rate': 1.0833e-06, 'epoch': 1.67, 'throughput': 1478.54}
478
-
479
- [INFO|callbacks.py:310] 2024-07-11 10:00:25,649 >> {'loss': 0.1357, 'learning_rate': 1.0917e-06, 'epoch': 1.68, 'throughput': 1478.41}
480
-
481
- [INFO|callbacks.py:310] 2024-07-11 10:00:36,774 >> {'loss': 0.1181, 'learning_rate': 1.1000e-06, 'epoch': 1.70, 'throughput': 1478.26}
482
-
483
- [INFO|callbacks.py:310] 2024-07-11 10:00:47,927 >> {'loss': 0.0826, 'learning_rate': 1.1083e-06, 'epoch': 1.71, 'throughput': 1478.30}
484
-
485
- [INFO|callbacks.py:310] 2024-07-11 10:00:59,041 >> {'loss': 0.1221, 'learning_rate': 1.1167e-06, 'epoch': 1.72, 'throughput': 1477.92}
486
-
487
- [INFO|callbacks.py:310] 2024-07-11 10:01:10,170 >> {'loss': 0.1021, 'learning_rate': 1.1250e-06, 'epoch': 1.74, 'throughput': 1478.20}
488
-
489
- [INFO|callbacks.py:310] 2024-07-11 10:01:21,237 >> {'loss': 0.0980, 'learning_rate': 1.1333e-06, 'epoch': 1.75, 'throughput': 1478.17}
490
-
491
- [INFO|callbacks.py:310] 2024-07-11 10:01:32,344 >> {'loss': 0.1085, 'learning_rate': 1.1417e-06, 'epoch': 1.76, 'throughput': 1478.40}
492
-
493
- [INFO|callbacks.py:310] 2024-07-11 10:01:43,423 >> {'loss': 0.1038, 'learning_rate': 1.1500e-06, 'epoch': 1.77, 'throughput': 1478.07}
494
-
495
- [INFO|callbacks.py:310] 2024-07-11 10:01:54,521 >> {'loss': 0.1081, 'learning_rate': 1.1583e-06, 'epoch': 1.79, 'throughput': 1478.19}
496
-
497
- [INFO|callbacks.py:310] 2024-07-11 10:02:05,652 >> {'loss': 0.1287, 'learning_rate': 1.1667e-06, 'epoch': 1.80, 'throughput': 1478.10}
498
-
499
- [INFO|callbacks.py:310] 2024-07-11 10:02:16,763 >> {'loss': 0.1068, 'learning_rate': 1.1750e-06, 'epoch': 1.81, 'throughput': 1478.11}
500
-
501
- [INFO|callbacks.py:310] 2024-07-11 10:02:27,850 >> {'loss': 0.1202, 'learning_rate': 1.1833e-06, 'epoch': 1.83, 'throughput': 1477.67}
502
-
503
- [INFO|callbacks.py:310] 2024-07-11 10:02:38,969 >> {'loss': 0.1190, 'learning_rate': 1.1917e-06, 'epoch': 1.84, 'throughput': 1477.42}
504
-
505
- [INFO|callbacks.py:310] 2024-07-11 10:02:50,085 >> {'loss': 0.1273, 'learning_rate': 1.2000e-06, 'epoch': 1.85, 'throughput': 1477.51}
506
-
507
- [INFO|callbacks.py:310] 2024-07-11 10:03:01,139 >> {'loss': 0.1024, 'learning_rate': 1.2083e-06, 'epoch': 1.86, 'throughput': 1477.11}
508
-
509
- [INFO|callbacks.py:310] 2024-07-11 10:03:12,232 >> {'loss': 0.1143, 'learning_rate': 1.2167e-06, 'epoch': 1.88, 'throughput': 1476.97}
510
-
511
- [INFO|callbacks.py:310] 2024-07-11 10:03:23,344 >> {'loss': 0.1229, 'learning_rate': 1.2250e-06, 'epoch': 1.89, 'throughput': 1477.01}
512
-
513
- [INFO|callbacks.py:310] 2024-07-11 10:03:34,449 >> {'loss': 0.0964, 'learning_rate': 1.2333e-06, 'epoch': 1.90, 'throughput': 1476.61}
514
-
515
- [INFO|callbacks.py:310] 2024-07-11 10:03:45,575 >> {'loss': 0.1132, 'learning_rate': 1.2417e-06, 'epoch': 1.92, 'throughput': 1477.03}
516
-
517
- [INFO|callbacks.py:310] 2024-07-11 10:03:56,704 >> {'loss': 0.0734, 'learning_rate': 1.2500e-06, 'epoch': 1.93, 'throughput': 1476.98}
518
 
519
- [INFO|callbacks.py:310] 2024-07-11 10:04:07,855 >> {'loss': 0.0939, 'learning_rate': 1.2583e-06, 'epoch': 1.94, 'throughput': 1477.57}
520
 
521
- [INFO|callbacks.py:310] 2024-07-11 10:04:18,984 >> {'loss': 0.1143, 'learning_rate': 1.2667e-06, 'epoch': 1.95, 'throughput': 1477.48}
522
 
523
- [INFO|callbacks.py:310] 2024-07-11 10:04:30,044 >> {'loss': 0.1114, 'learning_rate': 1.2750e-06, 'epoch': 1.97, 'throughput': 1476.92}
524
 
525
- [INFO|callbacks.py:310] 2024-07-11 10:04:41,125 >> {'loss': 0.0948, 'learning_rate': 1.2833e-06, 'epoch': 1.98, 'throughput': 1476.58}
526
 
527
- [INFO|callbacks.py:310] 2024-07-11 10:04:52,252 >> {'loss': 0.0805, 'learning_rate': 1.2917e-06, 'epoch': 1.99, 'throughput': 1476.85}
528
 
529
- [INFO|callbacks.py:310] 2024-07-11 10:05:03,374 >> {'loss': 0.1001, 'learning_rate': 1.3000e-06, 'epoch': 2.01, 'throughput': 1476.62}
530
 
531
- [INFO|callbacks.py:310] 2024-07-11 10:05:14,515 >> {'loss': 0.1002, 'learning_rate': 1.3083e-06, 'epoch': 2.02, 'throughput': 1476.44}
532
 
533
- [INFO|callbacks.py:310] 2024-07-11 10:05:25,648 >> {'loss': 0.0847, 'learning_rate': 1.3167e-06, 'epoch': 2.03, 'throughput': 1476.26}
534
 
535
- [INFO|callbacks.py:310] 2024-07-11 10:05:36,802 >> {'loss': 0.0710, 'learning_rate': 1.3250e-06, 'epoch': 2.05, 'throughput': 1476.41}
536
 
537
- [INFO|callbacks.py:310] 2024-07-11 10:05:47,915 >> {'loss': 0.0791, 'learning_rate': 1.3333e-06, 'epoch': 2.06, 'throughput': 1476.00}
538
 
539
- [INFO|callbacks.py:310] 2024-07-11 10:05:59,027 >> {'loss': 0.0769, 'learning_rate': 1.3417e-06, 'epoch': 2.07, 'throughput': 1475.83}
540
 
541
- [INFO|callbacks.py:310] 2024-07-11 10:06:10,091 >> {'loss': 0.0916, 'learning_rate': 1.3500e-06, 'epoch': 2.08, 'throughput': 1475.46}
542
-
543
- [INFO|callbacks.py:310] 2024-07-11 10:06:21,185 >> {'loss': 0.0584, 'learning_rate': 1.3583e-06, 'epoch': 2.10, 'throughput': 1475.48}
544
-
545
- [INFO|callbacks.py:310] 2024-07-11 10:06:32,281 >> {'loss': 0.0811, 'learning_rate': 1.3667e-06, 'epoch': 2.11, 'throughput': 1475.35}
546
-
547
- [INFO|callbacks.py:310] 2024-07-11 10:06:43,373 >> {'loss': 0.0663, 'learning_rate': 1.3750e-06, 'epoch': 2.12, 'throughput': 1475.17}
548
-
549
- [INFO|callbacks.py:310] 2024-07-11 10:06:54,462 >> {'loss': 0.0802, 'learning_rate': 1.3833e-06, 'epoch': 2.14, 'throughput': 1475.13}
550
-
551
- [INFO|callbacks.py:310] 2024-07-11 10:07:05,548 >> {'loss': 0.0587, 'learning_rate': 1.3917e-06, 'epoch': 2.15, 'throughput': 1474.82}
552
-
553
- [INFO|callbacks.py:310] 2024-07-11 10:07:16,701 >> {'loss': 0.0953, 'learning_rate': 1.4000e-06, 'epoch': 2.16, 'throughput': 1474.73}
554
-
555
- [INFO|callbacks.py:310] 2024-07-11 10:07:27,833 >> {'loss': 0.0795, 'learning_rate': 1.4083e-06, 'epoch': 2.17, 'throughput': 1474.63}
556
-
557
- [INFO|callbacks.py:310] 2024-07-11 10:07:38,959 >> {'loss': 0.1015, 'learning_rate': 1.4167e-06, 'epoch': 2.19, 'throughput': 1474.63}
558
-
559
- [INFO|callbacks.py:310] 2024-07-11 10:07:50,039 >> {'loss': 0.0614, 'learning_rate': 1.4250e-06, 'epoch': 2.20, 'throughput': 1474.37}
560
-
561
- [INFO|callbacks.py:310] 2024-07-11 10:08:01,150 >> {'loss': 0.0753, 'learning_rate': 1.4333e-06, 'epoch': 2.21, 'throughput': 1474.59}
562
-
563
- [INFO|callbacks.py:310] 2024-07-11 10:08:12,309 >> {'loss': 0.0800, 'learning_rate': 1.4417e-06, 'epoch': 2.23, 'throughput': 1475.05}
564
-
565
- [INFO|callbacks.py:310] 2024-07-11 10:08:23,444 >> {'loss': 0.0603, 'learning_rate': 1.4500e-06, 'epoch': 2.24, 'throughput': 1475.13}
566
-
567
- [INFO|callbacks.py:310] 2024-07-11 10:08:34,582 >> {'loss': 0.0874, 'learning_rate': 1.4583e-06, 'epoch': 2.25, 'throughput': 1475.01}
568
-
569
- [INFO|callbacks.py:310] 2024-07-11 10:08:45,695 >> {'loss': 0.0833, 'learning_rate': 1.4667e-06, 'epoch': 2.26, 'throughput': 1475.02}
570
-
571
- [INFO|callbacks.py:310] 2024-07-11 10:08:56,832 >> {'loss': 0.0676, 'learning_rate': 1.4750e-06, 'epoch': 2.28, 'throughput': 1474.80}
572
-
573
- [INFO|callbacks.py:310] 2024-07-11 10:09:07,950 >> {'loss': 0.0860, 'learning_rate': 1.4833e-06, 'epoch': 2.29, 'throughput': 1474.96}
574
-
575
- [INFO|callbacks.py:310] 2024-07-11 10:09:19,042 >> {'loss': 0.0662, 'learning_rate': 1.4917e-06, 'epoch': 2.30, 'throughput': 1474.55}
576
-
577
- [INFO|callbacks.py:310] 2024-07-11 10:09:30,161 >> {'loss': 0.1028, 'learning_rate': 1.5000e-06, 'epoch': 2.32, 'throughput': 1474.63}
578
-
579
- [INFO|callbacks.py:310] 2024-07-11 10:09:41,272 >> {'loss': 0.0882, 'learning_rate': 1.5083e-06, 'epoch': 2.33, 'throughput': 1474.45}
580
-
581
- [INFO|callbacks.py:310] 2024-07-11 10:09:52,377 >> {'loss': 0.0806, 'learning_rate': 1.5167e-06, 'epoch': 2.34, 'throughput': 1474.52}
582
-
583
- [INFO|callbacks.py:310] 2024-07-11 10:10:03,473 >> {'loss': 0.1108, 'learning_rate': 1.5250e-06, 'epoch': 2.35, 'throughput': 1474.19}
584
-
585
- [INFO|callbacks.py:310] 2024-07-11 10:10:14,639 >> {'loss': 0.0687, 'learning_rate': 1.5333e-06, 'epoch': 2.37, 'throughput': 1474.68}
586
-
587
- [INFO|callbacks.py:310] 2024-07-11 10:10:25,742 >> {'loss': 0.0736, 'learning_rate': 1.5417e-06, 'epoch': 2.38, 'throughput': 1474.67}
588
-
589
- [INFO|callbacks.py:310] 2024-07-11 10:10:36,853 >> {'loss': 0.0779, 'learning_rate': 1.5500e-06, 'epoch': 2.39, 'throughput': 1474.70}
590
-
591
- [INFO|callbacks.py:310] 2024-07-11 10:10:47,981 >> {'loss': 0.0709, 'learning_rate': 1.5583e-06, 'epoch': 2.41, 'throughput': 1474.61}
592
-
593
- [INFO|callbacks.py:310] 2024-07-11 10:10:59,101 >> {'loss': 0.0652, 'learning_rate': 1.5667e-06, 'epoch': 2.42, 'throughput': 1474.76}
594
-
595
- [INFO|callbacks.py:310] 2024-07-11 10:11:10,209 >> {'loss': 0.1095, 'learning_rate': 1.5750e-06, 'epoch': 2.43, 'throughput': 1475.00}
596
-
597
- [INFO|callbacks.py:310] 2024-07-11 10:11:21,338 >> {'loss': 0.0618, 'learning_rate': 1.5833e-06, 'epoch': 2.44, 'throughput': 1475.18}
598
-
599
- [INFO|callbacks.py:310] 2024-07-11 10:11:32,451 >> {'loss': 0.0666, 'learning_rate': 1.5917e-06, 'epoch': 2.46, 'throughput': 1475.36}
600
-
601
- [INFO|callbacks.py:310] 2024-07-11 10:11:43,569 >> {'loss': 0.0573, 'learning_rate': 1.6000e-06, 'epoch': 2.47, 'throughput': 1475.40}
602
-
603
- [INFO|callbacks.py:310] 2024-07-11 10:11:54,697 >> {'loss': 0.0577, 'learning_rate': 1.6083e-06, 'epoch': 2.48, 'throughput': 1475.50}
604
-
605
- [INFO|callbacks.py:310] 2024-07-11 10:12:05,820 >> {'loss': 0.0813, 'learning_rate': 1.6167e-06, 'epoch': 2.50, 'throughput': 1475.34}
606
-
607
- [INFO|callbacks.py:310] 2024-07-11 10:12:16,940 >> {'loss': 0.0660, 'learning_rate': 1.6250e-06, 'epoch': 2.51, 'throughput': 1475.24}
608
-
609
- [INFO|callbacks.py:310] 2024-07-11 10:12:28,083 >> {'loss': 0.0622, 'learning_rate': 1.6333e-06, 'epoch': 2.52, 'throughput': 1475.32}
610
-
611
- [INFO|callbacks.py:310] 2024-07-11 10:12:39,175 >> {'loss': 0.0616, 'learning_rate': 1.6417e-06, 'epoch': 2.53, 'throughput': 1475.12}
612
-
613
- [INFO|callbacks.py:310] 2024-07-11 10:12:50,265 >> {'loss': 0.0993, 'learning_rate': 1.6500e-06, 'epoch': 2.55, 'throughput': 1475.12}
614
-
615
- [INFO|callbacks.py:310] 2024-07-11 10:13:01,337 >> {'loss': 0.0702, 'learning_rate': 1.6583e-06, 'epoch': 2.56, 'throughput': 1475.10}
616
-
617
- [INFO|callbacks.py:310] 2024-07-11 10:13:12,425 >> {'loss': 0.0743, 'learning_rate': 1.6667e-06, 'epoch': 2.57, 'throughput': 1474.97}
618
-
619
- [INFO|callbacks.py:310] 2024-07-11 10:13:23,525 >> {'loss': 0.0647, 'learning_rate': 1.6750e-06, 'epoch': 2.59, 'throughput': 1474.93}
620
-
621
- [INFO|callbacks.py:310] 2024-07-11 10:13:34,651 >> {'loss': 0.0814, 'learning_rate': 1.6833e-06, 'epoch': 2.60, 'throughput': 1475.03}
622
-
623
- [INFO|callbacks.py:310] 2024-07-11 10:13:45,757 >> {'loss': 0.0861, 'learning_rate': 1.6917e-06, 'epoch': 2.61, 'throughput': 1474.89}
624
-
625
- [INFO|callbacks.py:310] 2024-07-11 10:13:56,905 >> {'loss': 0.0769, 'learning_rate': 1.7000e-06, 'epoch': 2.62, 'throughput': 1475.15}
626
-
627
- [INFO|callbacks.py:310] 2024-07-11 10:14:08,051 >> {'loss': 0.0888, 'learning_rate': 1.7083e-06, 'epoch': 2.64, 'throughput': 1475.31}
628
-
629
- [INFO|callbacks.py:310] 2024-07-11 10:14:19,164 >> {'loss': 0.1017, 'learning_rate': 1.7167e-06, 'epoch': 2.65, 'throughput': 1475.34}
630
-
631
- [INFO|callbacks.py:310] 2024-07-11 10:14:30,261 >> {'loss': 0.0677, 'learning_rate': 1.7250e-06, 'epoch': 2.66, 'throughput': 1475.25}
632
-
633
- [INFO|callbacks.py:310] 2024-07-11 10:14:41,375 >> {'loss': 0.0861, 'learning_rate': 1.7333e-06, 'epoch': 2.68, 'throughput': 1475.55}
634
-
635
- [INFO|callbacks.py:310] 2024-07-11 10:14:52,497 >> {'loss': 0.0562, 'learning_rate': 1.7417e-06, 'epoch': 2.69, 'throughput': 1475.69}
636
-
637
- [INFO|callbacks.py:310] 2024-07-11 10:15:03,612 >> {'loss': 0.0641, 'learning_rate': 1.7500e-06, 'epoch': 2.70, 'throughput': 1475.59}
638
-
639
- [INFO|callbacks.py:310] 2024-07-11 10:15:14,730 >> {'loss': 0.0829, 'learning_rate': 1.7583e-06, 'epoch': 2.71, 'throughput': 1475.88}
640
-
641
- [INFO|callbacks.py:310] 2024-07-11 10:15:25,844 >> {'loss': 0.0632, 'learning_rate': 1.7667e-06, 'epoch': 2.73, 'throughput': 1475.70}
642
-
643
- [INFO|callbacks.py:310] 2024-07-11 10:15:36,953 >> {'loss': 0.0620, 'learning_rate': 1.7750e-06, 'epoch': 2.74, 'throughput': 1475.36}
644
-
645
- [INFO|callbacks.py:310] 2024-07-11 10:15:48,070 >> {'loss': 0.0816, 'learning_rate': 1.7833e-06, 'epoch': 2.75, 'throughput': 1475.14}
646
-
647
- [INFO|callbacks.py:310] 2024-07-11 10:15:59,182 >> {'loss': 0.0960, 'learning_rate': 1.7917e-06, 'epoch': 2.77, 'throughput': 1475.26}
648
-
649
- [INFO|callbacks.py:310] 2024-07-11 10:16:10,296 >> {'loss': 0.0639, 'learning_rate': 1.8000e-06, 'epoch': 2.78, 'throughput': 1475.21}
650
-
651
- [INFO|callbacks.py:310] 2024-07-11 10:16:21,386 >> {'loss': 0.0915, 'learning_rate': 1.8083e-06, 'epoch': 2.79, 'throughput': 1474.69}
652
-
653
- [INFO|callbacks.py:310] 2024-07-11 10:16:32,501 >> {'loss': 0.0610, 'learning_rate': 1.8167e-06, 'epoch': 2.80, 'throughput': 1474.53}
654
-
655
- [INFO|callbacks.py:310] 2024-07-11 10:16:43,658 >> {'loss': 0.0572, 'learning_rate': 1.8250e-06, 'epoch': 2.82, 'throughput': 1474.35}
656
-
657
- [INFO|callbacks.py:310] 2024-07-11 10:16:54,814 >> {'loss': 0.0497, 'learning_rate': 1.8333e-06, 'epoch': 2.83, 'throughput': 1474.44}
658
-
659
- [INFO|callbacks.py:310] 2024-07-11 10:17:05,943 >> {'loss': 0.0672, 'learning_rate': 1.8417e-06, 'epoch': 2.84, 'throughput': 1474.31}
660
-
661
- [INFO|callbacks.py:310] 2024-07-11 10:17:17,069 >> {'loss': 0.0563, 'learning_rate': 1.8500e-06, 'epoch': 2.86, 'throughput': 1474.33}
662
-
663
- [INFO|callbacks.py:310] 2024-07-11 10:17:28,189 >> {'loss': 0.0690, 'learning_rate': 1.8583e-06, 'epoch': 2.87, 'throughput': 1474.50}
664
-
665
- [INFO|callbacks.py:310] 2024-07-11 10:17:39,309 >> {'loss': 0.0824, 'learning_rate': 1.8667e-06, 'epoch': 2.88, 'throughput': 1474.59}
666
-
667
- [INFO|callbacks.py:310] 2024-07-11 10:17:50,430 >> {'loss': 0.0570, 'learning_rate': 1.8750e-06, 'epoch': 2.89, 'throughput': 1474.72}
668
-
669
- [INFO|callbacks.py:310] 2024-07-11 10:18:01,560 >> {'loss': 0.0549, 'learning_rate': 1.8833e-06, 'epoch': 2.91, 'throughput': 1475.08}
670
-
671
- [INFO|callbacks.py:310] 2024-07-11 10:18:12,661 >> {'loss': 0.0652, 'learning_rate': 1.8917e-06, 'epoch': 2.92, 'throughput': 1474.98}
672
-
673
- [INFO|callbacks.py:310] 2024-07-11 10:18:23,802 >> {'loss': 0.0743, 'learning_rate': 1.9000e-06, 'epoch': 2.93, 'throughput': 1475.20}
674
-
675
- [INFO|callbacks.py:310] 2024-07-11 10:18:34,940 >> {'loss': 0.0416, 'learning_rate': 1.9083e-06, 'epoch': 2.95, 'throughput': 1475.35}
676
-
677
- [INFO|callbacks.py:310] 2024-07-11 10:18:46,091 >> {'loss': 0.0668, 'learning_rate': 1.9167e-06, 'epoch': 2.96, 'throughput': 1475.75}
678
-
679
- [INFO|callbacks.py:310] 2024-07-11 10:18:57,199 >> {'loss': 0.0603, 'learning_rate': 1.9250e-06, 'epoch': 2.97, 'throughput': 1475.60}
680
-
681
- [INFO|callbacks.py:310] 2024-07-11 10:19:08,355 >> {'loss': 0.0660, 'learning_rate': 1.9333e-06, 'epoch': 2.98, 'throughput': 1475.88}
682
-
683
- [INFO|callbacks.py:310] 2024-07-11 10:19:19,452 >> {'loss': 0.0692, 'learning_rate': 1.9417e-06, 'epoch': 3.00, 'throughput': 1475.87}
684
-
685
- [INFO|callbacks.py:310] 2024-07-11 10:19:30,549 >> {'loss': 0.0323, 'learning_rate': 1.9500e-06, 'epoch': 3.01, 'throughput': 1475.89}
686
-
687
- [INFO|callbacks.py:310] 2024-07-11 10:19:41,651 >> {'loss': 0.0403, 'learning_rate': 1.9583e-06, 'epoch': 3.02, 'throughput': 1475.72}
688
-
689
- [INFO|callbacks.py:310] 2024-07-11 10:19:52,776 >> {'loss': 0.0477, 'learning_rate': 1.9667e-06, 'epoch': 3.04, 'throughput': 1475.66}
690
-
691
- [INFO|callbacks.py:310] 2024-07-11 10:20:03,890 >> {'loss': 0.0356, 'learning_rate': 1.9750e-06, 'epoch': 3.05, 'throughput': 1475.41}
692
-
693
- [INFO|callbacks.py:310] 2024-07-11 10:20:14,994 >> {'loss': 0.0324, 'learning_rate': 1.9833e-06, 'epoch': 3.06, 'throughput': 1475.56}
694
-
695
- [INFO|callbacks.py:310] 2024-07-11 10:20:26,140 >> {'loss': 0.0428, 'learning_rate': 1.9917e-06, 'epoch': 3.07, 'throughput': 1475.76}
696
-
697
- [INFO|callbacks.py:310] 2024-07-11 10:20:37,270 >> {'loss': 0.0315, 'learning_rate': 2.0000e-06, 'epoch': 3.09, 'throughput': 1475.75}
698
-
699
- [INFO|callbacks.py:310] 2024-07-11 10:20:48,387 >> {'loss': 0.0366, 'learning_rate': 2.0083e-06, 'epoch': 3.10, 'throughput': 1475.78}
700
-
701
- [INFO|callbacks.py:310] 2024-07-11 10:20:59,460 >> {'loss': 0.0307, 'learning_rate': 2.0167e-06, 'epoch': 3.11, 'throughput': 1475.47}
702
-
703
- [INFO|callbacks.py:310] 2024-07-11 10:21:10,586 >> {'loss': 0.0338, 'learning_rate': 2.0250e-06, 'epoch': 3.13, 'throughput': 1475.52}
704
-
705
- [INFO|callbacks.py:310] 2024-07-11 10:21:21,705 >> {'loss': 0.0348, 'learning_rate': 2.0333e-06, 'epoch': 3.14, 'throughput': 1475.62}
706
-
707
- [INFO|callbacks.py:310] 2024-07-11 10:21:32,815 >> {'loss': 0.0355, 'learning_rate': 2.0417e-06, 'epoch': 3.15, 'throughput': 1475.75}
708
-
709
- [INFO|callbacks.py:310] 2024-07-11 10:21:43,960 >> {'loss': 0.0272, 'learning_rate': 2.0500e-06, 'epoch': 3.16, 'throughput': 1475.84}
710
-
711
- [INFO|callbacks.py:310] 2024-07-11 10:21:55,085 >> {'loss': 0.0464, 'learning_rate': 2.0583e-06, 'epoch': 3.18, 'throughput': 1475.83}
712
-
713
- [INFO|callbacks.py:310] 2024-07-11 10:22:06,205 >> {'loss': 0.0268, 'learning_rate': 2.0667e-06, 'epoch': 3.19, 'throughput': 1475.60}
714
-
715
- [INFO|callbacks.py:310] 2024-07-11 10:22:17,319 >> {'loss': 0.0229, 'learning_rate': 2.0750e-06, 'epoch': 3.20, 'throughput': 1475.38}
716
-
717
- [INFO|callbacks.py:310] 2024-07-11 10:22:28,439 >> {'loss': 0.0373, 'learning_rate': 2.0833e-06, 'epoch': 3.22, 'throughput': 1475.74}
718
-
719
- [INFO|callbacks.py:310] 2024-07-11 10:22:39,521 >> {'loss': 0.0361, 'learning_rate': 2.0917e-06, 'epoch': 3.23, 'throughput': 1475.59}
720
-
721
- [INFO|callbacks.py:310] 2024-07-11 10:22:50,624 >> {'loss': 0.0401, 'learning_rate': 2.1000e-06, 'epoch': 3.24, 'throughput': 1475.60}
722
-
723
- [INFO|callbacks.py:310] 2024-07-11 10:23:01,715 >> {'loss': 0.0262, 'learning_rate': 2.1083e-06, 'epoch': 3.25, 'throughput': 1475.38}
724
-
725
- [INFO|callbacks.py:310] 2024-07-11 10:23:12,825 >> {'loss': 0.0481, 'learning_rate': 2.1167e-06, 'epoch': 3.27, 'throughput': 1475.34}
726
-
727
- [INFO|callbacks.py:310] 2024-07-11 10:23:23,969 >> {'loss': 0.0417, 'learning_rate': 2.1250e-06, 'epoch': 3.28, 'throughput': 1475.21}
728
-
729
- [INFO|callbacks.py:310] 2024-07-11 10:23:35,127 >> {'loss': 0.0457, 'learning_rate': 2.1333e-06, 'epoch': 3.29, 'throughput': 1475.48}
730
-
731
- [INFO|callbacks.py:310] 2024-07-11 10:23:46,239 >> {'loss': 0.0175, 'learning_rate': 2.1417e-06, 'epoch': 3.31, 'throughput': 1475.19}
732
-
733
- [INFO|callbacks.py:310] 2024-07-11 10:23:57,391 >> {'loss': 0.0371, 'learning_rate': 2.1500e-06, 'epoch': 3.32, 'throughput': 1475.40}
734
-
735
- [INFO|callbacks.py:310] 2024-07-11 10:24:08,512 >> {'loss': 0.0268, 'learning_rate': 2.1583e-06, 'epoch': 3.33, 'throughput': 1475.58}
736
-
737
- [INFO|callbacks.py:310] 2024-07-11 10:24:19,598 >> {'loss': 0.0417, 'learning_rate': 2.1667e-06, 'epoch': 3.34, 'throughput': 1475.58}
738
-
739
- [INFO|callbacks.py:310] 2024-07-11 10:24:30,714 >> {'loss': 0.0264, 'learning_rate': 2.1750e-06, 'epoch': 3.36, 'throughput': 1475.32}
740
-
741
- [INFO|callbacks.py:310] 2024-07-11 10:24:41,829 >> {'loss': 0.0336, 'learning_rate': 2.1833e-06, 'epoch': 3.37, 'throughput': 1475.42}
742
-
743
- [INFO|callbacks.py:310] 2024-07-11 10:24:52,937 >> {'loss': 0.0434, 'learning_rate': 2.1917e-06, 'epoch': 3.38, 'throughput': 1475.28}
744
-
745
- [INFO|callbacks.py:310] 2024-07-11 10:25:04,052 >> {'loss': 0.0389, 'learning_rate': 2.2000e-06, 'epoch': 3.40, 'throughput': 1475.08}
746
-
747
- [INFO|callbacks.py:310] 2024-07-11 10:25:15,179 >> {'loss': 0.0450, 'learning_rate': 2.2083e-06, 'epoch': 3.41, 'throughput': 1474.84}
748
-
749
- [INFO|callbacks.py:310] 2024-07-11 10:25:26,284 >> {'loss': 0.0331, 'learning_rate': 2.2167e-06, 'epoch': 3.42, 'throughput': 1474.66}
750
-
751
- [INFO|callbacks.py:310] 2024-07-11 10:25:37,399 >> {'loss': 0.0228, 'learning_rate': 2.2250e-06, 'epoch': 3.43, 'throughput': 1474.73}
752
-
753
- [INFO|callbacks.py:310] 2024-07-11 10:25:48,485 >> {'loss': 0.0307, 'learning_rate': 2.2333e-06, 'epoch': 3.45, 'throughput': 1474.93}
754
-
755
- [INFO|callbacks.py:310] 2024-07-11 10:25:59,600 >> {'loss': 0.0332, 'learning_rate': 2.2417e-06, 'epoch': 3.46, 'throughput': 1474.79}
756
-
757
- [INFO|callbacks.py:310] 2024-07-11 10:26:10,724 >> {'loss': 0.0662, 'learning_rate': 2.2500e-06, 'epoch': 3.47, 'throughput': 1474.75}
758
-
759
- [INFO|callbacks.py:310] 2024-07-11 10:26:21,833 >> {'loss': 0.0431, 'learning_rate': 2.2583e-06, 'epoch': 3.49, 'throughput': 1474.76}
760
-
761
- [INFO|callbacks.py:310] 2024-07-11 10:26:32,988 >> {'loss': 0.0423, 'learning_rate': 2.2667e-06, 'epoch': 3.50, 'throughput': 1474.84}
762
-
763
- [INFO|callbacks.py:310] 2024-07-11 10:26:44,135 >> {'loss': 0.0447, 'learning_rate': 2.2750e-06, 'epoch': 3.51, 'throughput': 1474.86}
764
-
765
- [INFO|callbacks.py:310] 2024-07-11 10:26:55,240 >> {'loss': 0.0337, 'learning_rate': 2.2833e-06, 'epoch': 3.52, 'throughput': 1474.75}
766
-
767
- [INFO|callbacks.py:310] 2024-07-11 10:27:06,362 >> {'loss': 0.0319, 'learning_rate': 2.2917e-06, 'epoch': 3.54, 'throughput': 1474.61}
768
-
769
- [INFO|callbacks.py:310] 2024-07-11 10:27:17,507 >> {'loss': 0.0270, 'learning_rate': 2.3000e-06, 'epoch': 3.55, 'throughput': 1474.75}
770
-
771
- [INFO|callbacks.py:310] 2024-07-11 10:27:28,590 >> {'loss': 0.0244, 'learning_rate': 2.3083e-06, 'epoch': 3.56, 'throughput': 1474.91}
772
-
773
- [INFO|callbacks.py:310] 2024-07-11 10:27:39,702 >> {'loss': 0.0377, 'learning_rate': 2.3167e-06, 'epoch': 3.58, 'throughput': 1475.12}
774
-
775
- [INFO|callbacks.py:310] 2024-07-11 10:27:50,833 >> {'loss': 0.0477, 'learning_rate': 2.3250e-06, 'epoch': 3.59, 'throughput': 1475.12}
776
-
777
- [INFO|callbacks.py:310] 2024-07-11 10:28:01,921 >> {'loss': 0.0332, 'learning_rate': 2.3333e-06, 'epoch': 3.60, 'throughput': 1474.74}
778
-
779
- [INFO|callbacks.py:310] 2024-07-11 10:28:13,062 >> {'loss': 0.0285, 'learning_rate': 2.3417e-06, 'epoch': 3.61, 'throughput': 1474.97}
780
-
781
- [INFO|callbacks.py:310] 2024-07-11 10:28:24,197 >> {'loss': 0.0468, 'learning_rate': 2.3500e-06, 'epoch': 3.63, 'throughput': 1475.00}
782
-
783
- [INFO|callbacks.py:310] 2024-07-11 10:28:35,319 >> {'loss': 0.0370, 'learning_rate': 2.3583e-06, 'epoch': 3.64, 'throughput': 1475.13}
784
-
785
- [INFO|callbacks.py:310] 2024-07-11 10:28:46,466 >> {'loss': 0.0410, 'learning_rate': 2.3667e-06, 'epoch': 3.65, 'throughput': 1474.98}
786
-
787
- [INFO|callbacks.py:310] 2024-07-11 10:28:57,575 >> {'loss': 0.0461, 'learning_rate': 2.3750e-06, 'epoch': 3.67, 'throughput': 1475.10}
788
-
789
- [INFO|callbacks.py:310] 2024-07-11 10:29:08,687 >> {'loss': 0.0594, 'learning_rate': 2.3833e-06, 'epoch': 3.68, 'throughput': 1474.95}
790
-
791
- [INFO|callbacks.py:310] 2024-07-11 10:29:19,788 >> {'loss': 0.0728, 'learning_rate': 2.3917e-06, 'epoch': 3.69, 'throughput': 1475.07}
792
-
793
- [INFO|callbacks.py:310] 2024-07-11 10:29:30,916 >> {'loss': 0.0290, 'learning_rate': 2.4000e-06, 'epoch': 3.70, 'throughput': 1475.20}
794
-
795
- [INFO|callbacks.py:310] 2024-07-11 10:29:42,019 >> {'loss': 0.0412, 'learning_rate': 2.4083e-06, 'epoch': 3.72, 'throughput': 1475.44}
796
-
797
- [INFO|callbacks.py:310] 2024-07-11 10:29:53,160 >> {'loss': 0.0262, 'learning_rate': 2.4167e-06, 'epoch': 3.73, 'throughput': 1475.13}
798
-
799
- [INFO|callbacks.py:310] 2024-07-11 10:30:04,273 >> {'loss': 0.0796, 'learning_rate': 2.4250e-06, 'epoch': 3.74, 'throughput': 1474.96}
800
-
801
- [INFO|callbacks.py:310] 2024-07-11 10:30:15,400 >> {'loss': 0.0447, 'learning_rate': 2.4333e-06, 'epoch': 3.76, 'throughput': 1474.93}
802
-
803
- [INFO|callbacks.py:310] 2024-07-11 10:30:26,536 >> {'loss': 0.0252, 'learning_rate': 2.4417e-06, 'epoch': 3.77, 'throughput': 1474.95}
804
-
805
- [INFO|callbacks.py:310] 2024-07-11 10:30:37,624 >> {'loss': 0.0458, 'learning_rate': 2.4500e-06, 'epoch': 3.78, 'throughput': 1474.79}
806
-
807
- [INFO|callbacks.py:310] 2024-07-11 10:30:48,698 >> {'loss': 0.0431, 'learning_rate': 2.4583e-06, 'epoch': 3.79, 'throughput': 1474.62}
808
-
809
- [INFO|callbacks.py:310] 2024-07-11 10:30:59,827 >> {'loss': 0.0422, 'learning_rate': 2.4667e-06, 'epoch': 3.81, 'throughput': 1474.72}
810
-
811
- [INFO|callbacks.py:310] 2024-07-11 10:31:10,927 >> {'loss': 0.0428, 'learning_rate': 2.4750e-06, 'epoch': 3.82, 'throughput': 1474.85}
812
-
813
- [INFO|callbacks.py:310] 2024-07-11 10:31:22,040 >> {'loss': 0.0473, 'learning_rate': 2.4833e-06, 'epoch': 3.83, 'throughput': 1474.88}
814
-
815
- [INFO|callbacks.py:310] 2024-07-11 10:31:33,168 >> {'loss': 0.0223, 'learning_rate': 2.4917e-06, 'epoch': 3.85, 'throughput': 1474.81}
816
-
817
- [INFO|callbacks.py:310] 2024-07-11 10:31:44,304 >> {'loss': 0.0274, 'learning_rate': 2.5000e-06, 'epoch': 3.86, 'throughput': 1474.73}
818
-
819
- [INFO|callbacks.py:310] 2024-07-11 10:31:55,446 >> {'loss': 0.0454, 'learning_rate': 2.5083e-06, 'epoch': 3.87, 'throughput': 1474.68}
820
-
821
- [INFO|callbacks.py:310] 2024-07-11 10:32:06,565 >> {'loss': 0.0220, 'learning_rate': 2.5167e-06, 'epoch': 3.88, 'throughput': 1474.90}
822
-
823
- [INFO|callbacks.py:310] 2024-07-11 10:32:17,678 >> {'loss': 0.0357, 'learning_rate': 2.5250e-06, 'epoch': 3.90, 'throughput': 1474.97}
824
-
825
- [INFO|callbacks.py:310] 2024-07-11 10:32:28,794 >> {'loss': 0.0458, 'learning_rate': 2.5333e-06, 'epoch': 3.91, 'throughput': 1475.20}
826
-
827
- [INFO|callbacks.py:310] 2024-07-11 10:32:39,890 >> {'loss': 0.0454, 'learning_rate': 2.5417e-06, 'epoch': 3.92, 'throughput': 1475.13}
828
-
829
- [INFO|callbacks.py:310] 2024-07-11 10:32:51,000 >> {'loss': 0.0262, 'learning_rate': 2.5500e-06, 'epoch': 3.94, 'throughput': 1475.05}
830
-
831
- [INFO|callbacks.py:310] 2024-07-11 10:33:02,090 >> {'loss': 0.0234, 'learning_rate': 2.5583e-06, 'epoch': 3.95, 'throughput': 1474.86}
832
-
833
- [INFO|callbacks.py:310] 2024-07-11 10:33:13,206 >> {'loss': 0.0366, 'learning_rate': 2.5667e-06, 'epoch': 3.96, 'throughput': 1474.97}
834
-
835
- [INFO|callbacks.py:310] 2024-07-11 10:33:24,327 >> {'loss': 0.0236, 'learning_rate': 2.5750e-06, 'epoch': 3.97, 'throughput': 1474.97}
836
-
837
- [INFO|callbacks.py:310] 2024-07-11 10:33:35,481 >> {'loss': 0.0403, 'learning_rate': 2.5833e-06, 'epoch': 3.99, 'throughput': 1475.24}
838
-
839
- [INFO|callbacks.py:310] 2024-07-11 10:33:46,614 >> {'loss': 0.0395, 'learning_rate': 2.5917e-06, 'epoch': 4.00, 'throughput': 1475.24}
840
-
841
- [INFO|callbacks.py:310] 2024-07-11 10:33:57,694 >> {'loss': 0.0163, 'learning_rate': 2.6000e-06, 'epoch': 4.01, 'throughput': 1475.19}
842
-
843
- [INFO|callbacks.py:310] 2024-07-11 10:34:08,791 >> {'loss': 0.0181, 'learning_rate': 2.6083e-06, 'epoch': 4.03, 'throughput': 1475.00}
844
-
845
- [INFO|callbacks.py:310] 2024-07-11 10:34:19,898 >> {'loss': 0.0130, 'learning_rate': 2.6167e-06, 'epoch': 4.04, 'throughput': 1474.96}
846
-
847
- [INFO|callbacks.py:310] 2024-07-11 10:34:30,966 >> {'loss': 0.0178, 'learning_rate': 2.6250e-06, 'epoch': 4.05, 'throughput': 1474.78}
848
-
849
- [INFO|callbacks.py:310] 2024-07-11 10:34:42,123 >> {'loss': 0.0153, 'learning_rate': 2.6333e-06, 'epoch': 4.06, 'throughput': 1475.20}
850
-
851
- [INFO|callbacks.py:310] 2024-07-11 10:34:53,252 >> {'loss': 0.0205, 'learning_rate': 2.6417e-06, 'epoch': 4.08, 'throughput': 1475.09}
852
-
853
- [INFO|callbacks.py:310] 2024-07-11 10:35:04,398 >> {'loss': 0.0021, 'learning_rate': 2.6500e-06, 'epoch': 4.09, 'throughput': 1475.17}
854
-
855
- [INFO|callbacks.py:310] 2024-07-11 10:35:15,518 >> {'loss': 0.0220, 'learning_rate': 2.6583e-06, 'epoch': 4.10, 'throughput': 1475.20}
856
-
857
- [INFO|callbacks.py:310] 2024-07-11 10:35:26,625 >> {'loss': 0.0134, 'learning_rate': 2.6667e-06, 'epoch': 4.12, 'throughput': 1475.14}
858
-
859
- [INFO|callbacks.py:310] 2024-07-11 10:35:37,710 >> {'loss': 0.0175, 'learning_rate': 2.6750e-06, 'epoch': 4.13, 'throughput': 1474.95}
860
-
861
- [INFO|callbacks.py:310] 2024-07-11 10:35:48,853 >> {'loss': 0.0230, 'learning_rate': 2.6833e-06, 'epoch': 4.14, 'throughput': 1474.98}
862
-
863
- [INFO|callbacks.py:310] 2024-07-11 10:35:59,942 >> {'loss': 0.0303, 'learning_rate': 2.6917e-06, 'epoch': 4.15, 'throughput': 1474.77}
864
-
865
- [INFO|callbacks.py:310] 2024-07-11 10:36:11,072 >> {'loss': 0.0187, 'learning_rate': 2.7000e-06, 'epoch': 4.17, 'throughput': 1474.91}
866
-
867
- [INFO|callbacks.py:310] 2024-07-11 10:36:22,179 >> {'loss': 0.0126, 'learning_rate': 2.7083e-06, 'epoch': 4.18, 'throughput': 1474.71}
868
-
869
- [INFO|callbacks.py:310] 2024-07-11 10:36:33,308 >> {'loss': 0.0203, 'learning_rate': 2.7167e-06, 'epoch': 4.19, 'throughput': 1474.67}
870
-
871
- [INFO|callbacks.py:310] 2024-07-11 10:36:44,410 >> {'loss': 0.0078, 'learning_rate': 2.7250e-06, 'epoch': 4.21, 'throughput': 1474.62}
872
-
873
- [INFO|callbacks.py:310] 2024-07-11 10:36:55,536 >> {'loss': 0.0165, 'learning_rate': 2.7333e-06, 'epoch': 4.22, 'throughput': 1474.50}
874
-
875
- [INFO|callbacks.py:310] 2024-07-11 10:37:06,705 >> {'loss': 0.0113, 'learning_rate': 2.7417e-06, 'epoch': 4.23, 'throughput': 1474.78}
876
-
877
- [INFO|callbacks.py:310] 2024-07-11 10:37:17,796 >> {'loss': 0.0058, 'learning_rate': 2.7500e-06, 'epoch': 4.24, 'throughput': 1474.81}
878
-
879
- [INFO|callbacks.py:310] 2024-07-11 10:37:28,878 >> {'loss': 0.0070, 'learning_rate': 2.7583e-06, 'epoch': 4.26, 'throughput': 1474.67}
880
-
881
- [INFO|callbacks.py:310] 2024-07-11 10:37:39,999 >> {'loss': 0.0270, 'learning_rate': 2.7667e-06, 'epoch': 4.27, 'throughput': 1474.84}
882
-
883
- [INFO|callbacks.py:310] 2024-07-11 10:37:51,079 >> {'loss': 0.0276, 'learning_rate': 2.7750e-06, 'epoch': 4.28, 'throughput': 1474.61}
884
-
885
- [INFO|callbacks.py:310] 2024-07-11 10:38:02,192 >> {'loss': 0.0367, 'learning_rate': 2.7833e-06, 'epoch': 4.30, 'throughput': 1474.55}
886
-
887
- [INFO|callbacks.py:310] 2024-07-11 10:38:13,299 >> {'loss': 0.0161, 'learning_rate': 2.7917e-06, 'epoch': 4.31, 'throughput': 1474.44}
888
-
889
- [INFO|callbacks.py:310] 2024-07-11 10:38:24,431 >> {'loss': 0.0180, 'learning_rate': 2.8000e-06, 'epoch': 4.32, 'throughput': 1474.57}
890
-
891
- [INFO|callbacks.py:310] 2024-07-11 10:38:35,563 >> {'loss': 0.0044, 'learning_rate': 2.8083e-06, 'epoch': 4.33, 'throughput': 1474.64}
892
-
893
- [INFO|callbacks.py:310] 2024-07-11 10:38:46,691 >> {'loss': 0.0109, 'learning_rate': 2.8167e-06, 'epoch': 4.35, 'throughput': 1474.75}
894
-
895
- [INFO|callbacks.py:310] 2024-07-11 10:38:57,770 >> {'loss': 0.0173, 'learning_rate': 2.8250e-06, 'epoch': 4.36, 'throughput': 1474.67}
896
-
897
- [INFO|callbacks.py:310] 2024-07-11 10:39:08,939 >> {'loss': 0.0107, 'learning_rate': 2.8333e-06, 'epoch': 4.37, 'throughput': 1474.70}
898
-
899
- [INFO|callbacks.py:310] 2024-07-11 10:39:20,028 >> {'loss': 0.0116, 'learning_rate': 2.8417e-06, 'epoch': 4.39, 'throughput': 1474.55}
900
-
901
- [INFO|callbacks.py:310] 2024-07-11 10:39:31,145 >> {'loss': 0.0281, 'learning_rate': 2.8500e-06, 'epoch': 4.40, 'throughput': 1474.78}
902
-
903
- [INFO|callbacks.py:310] 2024-07-11 10:39:42,249 >> {'loss': 0.0246, 'learning_rate': 2.8583e-06, 'epoch': 4.41, 'throughput': 1474.86}
904
-
905
- [INFO|callbacks.py:310] 2024-07-11 10:39:53,373 >> {'loss': 0.0146, 'learning_rate': 2.8667e-06, 'epoch': 4.42, 'throughput': 1474.84}
906
-
907
- [INFO|callbacks.py:310] 2024-07-11 10:40:04,489 >> {'loss': 0.0439, 'learning_rate': 2.8750e-06, 'epoch': 4.44, 'throughput': 1474.62}
908
-
909
- [INFO|callbacks.py:310] 2024-07-11 10:40:15,615 >> {'loss': 0.0279, 'learning_rate': 2.8833e-06, 'epoch': 4.45, 'throughput': 1474.60}
910
-
911
- [INFO|callbacks.py:310] 2024-07-11 10:40:26,740 >> {'loss': 0.0276, 'learning_rate': 2.8917e-06, 'epoch': 4.46, 'throughput': 1474.70}
912
-
913
- [INFO|callbacks.py:310] 2024-07-11 10:40:37,836 >> {'loss': 0.0167, 'learning_rate': 2.9000e-06, 'epoch': 4.48, 'throughput': 1474.63}
914
-
915
- [INFO|callbacks.py:310] 2024-07-11 10:40:48,963 >> {'loss': 0.0258, 'learning_rate': 2.9083e-06, 'epoch': 4.49, 'throughput': 1474.83}
916
-
917
- [INFO|callbacks.py:310] 2024-07-11 10:41:00,055 >> {'loss': 0.0306, 'learning_rate': 2.9167e-06, 'epoch': 4.50, 'throughput': 1474.91}
918
-
919
- [INFO|callbacks.py:310] 2024-07-11 10:41:11,182 >> {'loss': 0.0395, 'learning_rate': 2.9250e-06, 'epoch': 4.51, 'throughput': 1475.01}
920
-
921
- [INFO|callbacks.py:310] 2024-07-11 10:41:22,328 >> {'loss': 0.0203, 'learning_rate': 2.9333e-06, 'epoch': 4.53, 'throughput': 1475.06}
922
-
923
- [INFO|callbacks.py:310] 2024-07-11 10:41:33,436 >> {'loss': 0.0241, 'learning_rate': 2.9417e-06, 'epoch': 4.54, 'throughput': 1475.01}
924
-
925
- [INFO|callbacks.py:310] 2024-07-11 10:41:44,523 >> {'loss': 0.0151, 'learning_rate': 2.9500e-06, 'epoch': 4.55, 'throughput': 1474.68}
926
-
927
- [INFO|callbacks.py:310] 2024-07-11 10:41:55,641 >> {'loss': 0.0355, 'learning_rate': 2.9583e-06, 'epoch': 4.57, 'throughput': 1474.59}
928
-
929
- [INFO|callbacks.py:310] 2024-07-11 10:42:06,750 >> {'loss': 0.0093, 'learning_rate': 2.9667e-06, 'epoch': 4.58, 'throughput': 1474.72}
930
-
931
- [INFO|callbacks.py:310] 2024-07-11 10:42:17,848 >> {'loss': 0.0314, 'learning_rate': 2.9750e-06, 'epoch': 4.59, 'throughput': 1474.75}
932
-
933
- [INFO|callbacks.py:310] 2024-07-11 10:42:28,952 >> {'loss': 0.0155, 'learning_rate': 2.9833e-06, 'epoch': 4.60, 'throughput': 1474.77}
934
-
935
- [INFO|callbacks.py:310] 2024-07-11 10:42:40,070 >> {'loss': 0.0138, 'learning_rate': 2.9917e-06, 'epoch': 4.62, 'throughput': 1474.94}
936
-
937
- [INFO|callbacks.py:310] 2024-07-11 10:42:51,212 >> {'loss': 0.0212, 'learning_rate': 3.0000e-06, 'epoch': 4.63, 'throughput': 1475.10}
938
-
939
- [INFO|callbacks.py:310] 2024-07-11 10:43:02,328 >> {'loss': 0.0255, 'learning_rate': 3.0083e-06, 'epoch': 4.64, 'throughput': 1475.11}
940
-
941
- [INFO|callbacks.py:310] 2024-07-11 10:43:13,439 >> {'loss': 0.0231, 'learning_rate': 3.0167e-06, 'epoch': 4.66, 'throughput': 1474.99}
942
-
943
- [INFO|callbacks.py:310] 2024-07-11 10:43:24,561 >> {'loss': 0.0083, 'learning_rate': 3.0250e-06, 'epoch': 4.67, 'throughput': 1474.99}
944
-
945
- [INFO|callbacks.py:310] 2024-07-11 10:43:35,710 >> {'loss': 0.0122, 'learning_rate': 3.0333e-06, 'epoch': 4.68, 'throughput': 1475.18}
946
-
947
- [INFO|callbacks.py:310] 2024-07-11 10:43:46,811 >> {'loss': 0.0142, 'learning_rate': 3.0417e-06, 'epoch': 4.69, 'throughput': 1475.37}
948
-
949
- [INFO|callbacks.py:310] 2024-07-11 10:43:57,895 >> {'loss': 0.0293, 'learning_rate': 3.0500e-06, 'epoch': 4.71, 'throughput': 1475.24}
950
-
951
- [INFO|callbacks.py:310] 2024-07-11 10:44:09,006 >> {'loss': 0.0292, 'learning_rate': 3.0583e-06, 'epoch': 4.72, 'throughput': 1475.50}
952
-
953
- [INFO|callbacks.py:310] 2024-07-11 10:44:20,120 >> {'loss': 0.0320, 'learning_rate': 3.0667e-06, 'epoch': 4.73, 'throughput': 1475.46}
954
-
955
- [INFO|callbacks.py:310] 2024-07-11 10:44:31,238 >> {'loss': 0.0189, 'learning_rate': 3.0750e-06, 'epoch': 4.75, 'throughput': 1475.50}
956
-
957
- [INFO|callbacks.py:310] 2024-07-11 10:44:42,405 >> {'loss': 0.0220, 'learning_rate': 3.0833e-06, 'epoch': 4.76, 'throughput': 1475.57}
958
-
959
- [INFO|callbacks.py:310] 2024-07-11 10:44:53,524 >> {'loss': 0.0242, 'learning_rate': 3.0917e-06, 'epoch': 4.77, 'throughput': 1475.68}
960
-
961
- [INFO|callbacks.py:310] 2024-07-11 10:45:04,670 >> {'loss': 0.0150, 'learning_rate': 3.1000e-06, 'epoch': 4.78, 'throughput': 1475.82}
962
-
963
- [INFO|callbacks.py:310] 2024-07-11 10:45:15,790 >> {'loss': 0.0154, 'learning_rate': 3.1083e-06, 'epoch': 4.80, 'throughput': 1475.81}
964
-
965
- [INFO|callbacks.py:310] 2024-07-11 10:45:26,907 >> {'loss': 0.0195, 'learning_rate': 3.1167e-06, 'epoch': 4.81, 'throughput': 1475.92}
966
-
967
- [INFO|callbacks.py:310] 2024-07-11 10:45:38,027 >> {'loss': 0.0310, 'learning_rate': 3.1250e-06, 'epoch': 4.82, 'throughput': 1476.19}
968
-
969
- [INFO|callbacks.py:310] 2024-07-11 10:45:49,107 >> {'loss': 0.0376, 'learning_rate': 3.1333e-06, 'epoch': 4.84, 'throughput': 1476.09}
970
 
971
- [INFO|callbacks.py:310] 2024-07-11 10:46:00,236 >> {'loss': 0.0093, 'learning_rate': 3.1417e-06, 'epoch': 4.85, 'throughput': 1476.24}
972
 
973
- [INFO|callbacks.py:310] 2024-07-11 10:46:11,360 >> {'loss': 0.0270, 'learning_rate': 3.1500e-06, 'epoch': 4.86, 'throughput': 1476.22}
974
 
975
- [INFO|callbacks.py:310] 2024-07-11 10:46:22,487 >> {'loss': 0.0191, 'learning_rate': 3.1583e-06, 'epoch': 4.87, 'throughput': 1476.16}
976
 
977
- [INFO|callbacks.py:310] 2024-07-11 10:46:33,618 >> {'loss': 0.0132, 'learning_rate': 3.1667e-06, 'epoch': 4.89, 'throughput': 1476.06}
978
 
979
- [INFO|callbacks.py:310] 2024-07-11 10:46:44,735 >> {'loss': 0.0111, 'learning_rate': 3.1750e-06, 'epoch': 4.90, 'throughput': 1475.94}
980
 
981
- [INFO|callbacks.py:310] 2024-07-11 10:46:55,863 >> {'loss': 0.0145, 'learning_rate': 3.1833e-06, 'epoch': 4.91, 'throughput': 1475.98}
982
 
983
- [INFO|callbacks.py:310] 2024-07-11 10:47:06,943 >> {'loss': 0.0082, 'learning_rate': 3.1917e-06, 'epoch': 4.93, 'throughput': 1475.97}
984
 
985
- [INFO|callbacks.py:310] 2024-07-11 10:47:18,028 >> {'loss': 0.0165, 'learning_rate': 3.2000e-06, 'epoch': 4.94, 'throughput': 1475.81}
986
 
987
- [INFO|callbacks.py:310] 2024-07-11 10:47:29,154 >> {'loss': 0.0243, 'learning_rate': 3.2083e-06, 'epoch': 4.95, 'throughput': 1476.08}
988
 
989
- [INFO|trainer.py:3478] 2024-07-11 10:47:35,554 >> Saving model checkpoint to saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/checkpoint-385
990
 
991
- [INFO|configuration_utils.py:472] 2024-07-11 10:47:35,557 >> Configuration saved in saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/checkpoint-385/config.json
992
 
993
- [INFO|configuration_utils.py:769] 2024-07-11 10:47:35,558 >> Configuration saved in saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/checkpoint-385/generation_config.json
994
 
995
- [INFO|modeling_utils.py:2698] 2024-07-11 10:47:49,264 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 3 checkpoint shards. You can find where each parameters has been saved in the index located at saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/checkpoint-385/model.safetensors.index.json.
996
 
997
- [INFO|tokenization_utils_base.py:2574] 2024-07-11 10:47:49,265 >> tokenizer config file saved in saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/checkpoint-385/tokenizer_config.json
998
 
999
- [INFO|tokenization_utils_base.py:2583] 2024-07-11 10:47:49,265 >> Special tokens file saved in saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/checkpoint-385/special_tokens_map.json
1000
 
1001
- [INFO|trainer.py:2383] 2024-07-11 10:48:19,619 >>
1002
 
1003
- Training completed. Do not forget to share your model on huggingface.co/models =)
 
1004
 
 
1005
 
 
1006
 
1007
- [INFO|trainer.py:3478] 2024-07-11 10:48:26,073 >> Saving model checkpoint to saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth
1008
 
1009
- [INFO|configuration_utils.py:472] 2024-07-11 10:48:26,076 >> Configuration saved in saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/config.json
1010
 
1011
- [INFO|configuration_utils.py:769] 2024-07-11 10:48:26,076 >> Configuration saved in saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/generation_config.json
1012
 
1013
- [INFO|modeling_utils.py:2698] 2024-07-11 10:48:39,859 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 3 checkpoint shards. You can find where each parameters has been saved in the index located at saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/model.safetensors.index.json.
1014
 
1015
- [INFO|tokenization_utils_base.py:2574] 2024-07-11 10:48:39,860 >> tokenizer config file saved in saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/tokenizer_config.json
1016
 
1017
- [INFO|tokenization_utils_base.py:2583] 2024-07-11 10:48:39,860 >> Special tokens file saved in saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/special_tokens_map.json
1018
 
1019
- [WARNING|ploting.py:89] 2024-07-11 10:48:40,948 >> No metric eval_loss to plot.
1020
 
1021
- [WARNING|ploting.py:89] 2024-07-11 10:48:40,949 >> No metric eval_accuracy to plot.
1022
 
1023
- [INFO|modelcard.py:449] 2024-07-11 10:48:40,950 >> Dropping the following result as it does not have all the necessary fields:
1024
- {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
1025
 
 
1
+ [INFO|parser.py:325] 2024-07-11 11:00:10,231 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: None
2
 
3
+ [INFO|tokenization_utils_base.py:2159] 2024-07-11 11:00:10,234 >> loading file tokenizer.model
4
 
5
+ 07/11/2024 11:00:10 - INFO - llamafactory.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: None
6
 
7
+ 07/11/2024 11:00:10 - INFO - llamafactory.hparams.parser - Process rank: 3, device: cuda:3, n_gpu: 1, distributed training: True, compute dtype: None
8
 
9
+ [INFO|tokenization_utils_base.py:2159] 2024-07-11 11:00:10,234 >> loading file tokenizer.json
10
 
11
+ [INFO|tokenization_utils_base.py:2159] 2024-07-11 11:00:10,234 >> loading file added_tokens.json
12
 
13
+ [INFO|tokenization_utils_base.py:2159] 2024-07-11 11:00:10,234 >> loading file special_tokens_map.json
14
 
15
+ [INFO|tokenization_utils_base.py:2159] 2024-07-11 11:00:10,235 >> loading file tokenizer_config.json
16
 
17
+ [INFO|loader.py:50] 2024-07-11 11:00:10,286 >> Loading dataset dev_output.json...
18
 
19
+ 07/11/2024 11:00:10 - INFO - llamafactory.hparams.parser - Process rank: 7, device: cuda:7, n_gpu: 1, distributed training: True, compute dtype: None
20
 
21
+ 07/11/2024 11:00:10 - INFO - llamafactory.hparams.parser - Process rank: 5, device: cuda:5, n_gpu: 1, distributed training: True, compute dtype: None
22
 
23
+ 07/11/2024 11:00:10 - INFO - llamafactory.hparams.parser - Process rank: 4, device: cuda:4, n_gpu: 1, distributed training: True, compute dtype: None
24
 
25
+ 07/11/2024 11:00:11 - INFO - llamafactory.data.loader - Loading dataset dev_output.json...
26
 
27
+ 07/11/2024 11:00:11 - INFO - llamafactory.data.loader - Loading dataset dev_output.json...
28
 
29
+ 07/11/2024 11:00:11 - INFO - llamafactory.data.loader - Loading dataset dev_output.json...
30
 
31
+ 07/11/2024 11:00:11 - INFO - llamafactory.data.loader - Loading dataset dev_output.json...
32
 
33
+ 07/11/2024 11:00:11 - INFO - llamafactory.data.loader - Loading dataset dev_output.json...
34
 
35
+ 07/11/2024 11:00:11 - INFO - llamafactory.data.loader - Loading dataset dev_output.json...
36
 
37
+ 07/11/2024 11:00:11 - INFO - llamafactory.data.loader - Loading dataset dev_output.json...
38
 
39
+ [INFO|configuration_utils.py:731] 2024-07-11 11:00:12,504 >> loading configuration file saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/config.json
40
 
41
+ [INFO|configuration_utils.py:800] 2024-07-11 11:00:12,505 >> Model config LlamaConfig {
42
+ "_name_or_path": "saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth",
43
  "architectures": [
44
  "LlamaForCausalLM"
45
  ],
 
62
  "rope_scaling": null,
63
  "rope_theta": 10000.0,
64
  "tie_word_embeddings": false,
65
+ "torch_dtype": "bfloat16",
66
  "transformers_version": "4.42.3",
67
+ "use_cache": false,
68
  "vocab_size": 32000
69
  }
70
 
71
 
72
+ [INFO|patcher.py:81] 2024-07-11 11:00:12,505 >> Using KV cache for faster generation.
73
 
74
+ [INFO|modeling_utils.py:3553] 2024-07-11 11:00:12,529 >> loading weights file saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/model.safetensors.index.json
75
 
76
+ [INFO|modeling_utils.py:1531] 2024-07-11 11:00:12,529 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
 
77
 
78
+ [INFO|configuration_utils.py:1000] 2024-07-11 11:00:12,531 >> Generate config GenerationConfig {
 
 
79
  "bos_token_id": 1,
80
+ "eos_token_id": 2
81
  }
82
 
83
 
84
+ 07/11/2024 11:00:12 - INFO - llamafactory.model.patcher - Using KV cache for faster generation.
85
 
86
+ 07/11/2024 11:00:12 - INFO - llamafactory.model.patcher - Using KV cache for faster generation.
87
 
88
+ 07/11/2024 11:00:12 - INFO - llamafactory.model.patcher - Using KV cache for faster generation.
89
 
90
+ 07/11/2024 11:00:12 - INFO - llamafactory.model.patcher - Using KV cache for faster generation.
91
 
92
+ 07/11/2024 11:00:12 - INFO - llamafactory.model.patcher - Using KV cache for faster generation.
93
 
94
+ 07/11/2024 11:00:12 - INFO - llamafactory.model.patcher - Using KV cache for faster generation.
95
 
96
+ 07/11/2024 11:00:12 - INFO - llamafactory.model.patcher - Using KV cache for faster generation.
97
 
98
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
99
 
100
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.loader - all params: 6,738,415,616
101
 
102
+ [INFO|modeling_utils.py:4364] 2024-07-11 11:00:16,788 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
103
 
 
104
 
105
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
106
 
107
+ [INFO|modeling_utils.py:4372] 2024-07-11 11:00:16,788 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth.
108
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
109
 
110
+ [INFO|configuration_utils.py:953] 2024-07-11 11:00:16,792 >> loading configuration file saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth/generation_config.json
111
 
112
+ [INFO|configuration_utils.py:1000] 2024-07-11 11:00:16,792 >> Generate config GenerationConfig {
113
+ "bos_token_id": 1,
114
+ "do_sample": true,
115
+ "eos_token_id": 2,
116
+ "max_length": 4096,
117
+ "pad_token_id": 0,
118
+ "temperature": 0.6,
119
+ "top_p": 0.9
120
+ }
121
 
 
122
 
123
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.loader - all params: 6,738,415,616
124
 
125
+ [INFO|attention.py:80] 2024-07-11 11:00:16,798 >> Using torch SDPA for faster training and inference.
126
 
127
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
128
 
129
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
130
 
131
+ [INFO|loader.py:196] 2024-07-11 11:00:16,802 >> all params: 6,738,415,616
132
 
133
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.loader - all params: 6,738,415,616
134
 
135
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.loader - all params: 6,738,415,616
136
 
137
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
138
 
139
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.loader - all params: 6,738,415,616
140
 
141
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
142
 
143
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
144
 
145
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.loader - all params: 6,738,415,616
146
 
147
+ 07/11/2024 11:00:16 - INFO - llamafactory.model.loader - all params: 6,738,415,616
148
 
149
+ [INFO|trainer.py:3788] 2024-07-11 11:00:16,914 >>
150
+ ***** Running Prediction *****
151
 
152
+ [INFO|trainer.py:3790] 2024-07-11 11:00:16,914 >> Num examples = 2554
153
 
154
+ [INFO|trainer.py:3793] 2024-07-11 11:00:16,914 >> Batch size = 2
155
 
156
+ 07/11/2024 11:00:17 - WARNING - transformers.models.llama.modeling_llama - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
157
 
158
+ 07/11/2024 11:00:17 - WARNING - transformers.models.llama.modeling_llama - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
159
 
160
+ [WARNING|logging.py:328] 2024-07-11 11:00:17,582 >> We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
161
 
162
+ 07/11/2024 11:00:17 - WARNING - transformers.models.llama.modeling_llama - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
163
 
164
+ 07/11/2024 11:00:17 - WARNING - transformers.models.llama.modeling_llama - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
165
 
166
+ 07/11/2024 11:00:17 - WARNING - transformers.models.llama.modeling_llama - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
167
 
168
+ 07/11/2024 11:00:17 - WARNING - transformers.models.llama.modeling_llama - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
169
 
170
+ 07/11/2024 11:00:17 - WARNING - transformers.models.llama.modeling_llama - We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
171
 
172
+ [INFO|trainer.py:127] 2024-07-11 11:00:34,679 >> Saving prediction results to saves/LLaMA2-7B-Chat/full/eval_2024-07-11-10-49-45/generated_predictions.jsonl
 
173
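The prediction run above writes its outputs to saves/LLaMA2-7B-Chat/full/eval_2024-07-11-10-49-45/generated_predictions.jsonl. A minimal sketch for inspecting that file, assuming the usual LLaMA-Factory layout of one JSON object per line with `label` and `predict` fields (the field names are an assumption, not confirmed by this log):

```python
import json

# Sketch: load the prediction dump written by the run logged above.
# Field names ("label", "predict") are assumed, not shown in this diff.
path = "saves/LLaMA2-7B-Chat/full/eval_2024-07-11-10-49-45/generated_predictions.jsonl"

records = []
with open(path, encoding="utf-8") as f:
    for line in f:
        if line.strip():
            records.append(json.loads(line))

# Crude exact-match rate as a sanity check next to the BLEU/ROUGE scores.
matches = sum(
    1 for r in records
    if r.get("predict", "").strip() == r.get("label", "").strip()
)
print(f"{matches}/{len(records)} exact matches")
```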
 
trainer_log.jsonl CHANGED
The diff for this file is too large to render. See raw diff
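trainer_log.jsonl is too large to render here, but each record mirrors the callback lines above (loss, learning rate, epoch, throughput). A short sketch for summarizing the loss curve, assuming one JSON object per line with a `loss` key (the exact schema is an assumption):

```python
import json

# Sketch: summarize per-step training loss from trainer_log.jsonl.
# The "loss" and "epoch" keys follow the callback lines above; the
# actual JSONL schema is assumed, not shown in this diff.
losses = []
with open("trainer_log.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        if "loss" in record:
            losses.append((record.get("epoch"), record["loss"]))

if losses:
    epoch, loss = losses[-1]
    print(f"{len(losses)} logged steps, final loss {loss:.4f} at epoch {epoch}")
```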
 
training_args.yaml CHANGED
@@ -1,30 +1,18 @@
1
- bf16: true
2
  cutoff_len: 1024
3
- dataset: truth_train
4
  dataset_dir: data
5
- ddp_timeout: 180000000
6
- deepspeed: cache/ds_z2_config.json
7
- do_train: true
8
  finetuning_type: full
9
  flash_attn: auto
10
- gradient_accumulation_steps: 8
11
- include_num_input_tokens_seen: true
12
- learning_rate: 5.0e-06
13
- logging_steps: 1
14
- lr_scheduler_type: cosine
15
- max_grad_norm: 1.0
16
  max_samples: 100000
17
- model_name_or_path: meta-llama/Llama-2-7b-chat-hf
18
- num_train_epochs: 5.0
19
- optim: adamw_torch
20
- output_dir: saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth
21
- packing: false
22
- per_device_train_batch_size: 4
23
- plot_loss: true
24
  preprocessing_num_workers: 16
25
  quantization_method: bitsandbytes
26
- report_to: none
27
- save_steps: 980
28
  stage: sft
 
29
  template: llama2
30
- warmup_steps: 600
 
 
1
  cutoff_len: 1024
2
+ dataset: truth_dev
3
  dataset_dir: data
4
+ do_predict: true
 
 
5
  finetuning_type: full
6
  flash_attn: auto
7
+ max_new_tokens: 512
8
  max_samples: 100000
9
+ model_name_or_path: saves/LLaMA2-7B-Chat/full/train_2024-07-11-09-30-54_llama2_inst_truth
10
+ output_dir: saves/LLaMA2-7B-Chat/full/eval_2024-07-11-10-49-45
11
+ per_device_eval_batch_size: 2
12
+ predict_with_generate: true
 
 
 
13
  preprocessing_num_workers: 16
14
  quantization_method: bitsandbytes
 
 
15
  stage: sft
16
+ temperature: 0.95
17
  template: llama2
18
+ top_p: 0.7
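
The new evaluation config samples with temperature 0.95, top_p 0.7, and up to 512 new tokens. A sketch of the equivalent transformers GenerationConfig, mirroring the YAML above and the token ids from the logged generation config (illustrative only, not how LLaMA-Factory builds the object internally):

```python
from transformers import GenerationConfig

# Sketch: the sampling settings requested by the eval config above,
# expressed as a transformers GenerationConfig (illustrative only).
gen_config = GenerationConfig(
    do_sample=True,
    temperature=0.95,
    top_p=0.7,
    max_new_tokens=512,
    bos_token_id=1,
    eos_token_id=2,
    pad_token_id=0,
)
print(gen_config)
```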