Feature Extraction
Transformers
Safetensors
Chinese
internlm2
custom_code
RioLee committed on
Commit
519268e
1 Parent(s): ff5cb56

Delete logs.txt

Files changed (1)
  1. logs.txt +0 -981
logs.txt DELETED
@@ -1,981 +0,0 @@
1
- [2024-02-01 14:20:07,768] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
2
- [2024-02-01 14:20:09,368] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
3
- [2024-02-01 14:20:09,369] [INFO] [runner.py:568:main] cmd = /home/lirenhao/anaconda3/envs/llama_factory/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=2345 --enable_each_rank_log=None /home/lirenhao/projects/LLaMA-Factory/src/train_bash.py --deepspeed ds_config.json --stage sft --model_name_or_path /home/lirenhao/pretrained_models/internlm2-chat-7b/ --do_train --dataset cpsycoun --template intern2 --finetuning_type full --lora_target wqkv --output_dir /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9 --overwrite_cache --overwrite_output_dir --per_device_train_batch_size 4 --gradient_accumulation_steps 28 --lr_scheduler_type cosine --logging_steps 10 --save_steps 21 --learning_rate 1e-6 --num_train_epochs 9.0 --plot_loss --fp16
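
Note: the opaque --world_info value in the launcher command above is just base64-encoded JSON; a two-line check (an illustrative snippet, not part of the log) decodes it to the same mapping DeepSpeed prints as WORLD INFO DICT a few lines below.

    import base64
    # decodes to b'{"localhost": [0, 1, 2, 3]}' -- i.e. four local GPU ranks
    print(base64.b64decode("eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119"))
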
4
- [2024-02-01 14:20:12,819] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
5
- [2024-02-01 14:20:14,435] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]}
6
- [2024-02-01 14:20:14,436] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=4, node_rank=0
7
- [2024-02-01 14:20:14,436] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3]})
8
- [2024-02-01 14:20:14,436] [INFO] [launch.py:163:main] dist_world_size=4
9
- [2024-02-01 14:20:14,436] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
10
- [2024-02-01 14:20:19,797] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
11
- [2024-02-01 14:20:20,069] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
12
- [2024-02-01 14:20:20,128] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
13
- [2024-02-01 14:20:20,157] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
14
- [2024-02-01 14:20:22,839] [INFO] [comm.py:637:init_distributed] cdb=None
15
- [2024-02-01 14:20:23,347] [INFO] [comm.py:637:init_distributed] cdb=None
16
- [2024-02-01 14:20:23,364] [INFO] [comm.py:637:init_distributed] cdb=None
17
- [2024-02-01 14:20:23,375] [INFO] [comm.py:637:init_distributed] cdb=None
18
- [2024-02-01 14:20:23,376] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
19
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Process rank: 2, device: cuda:2, n_gpu: 1
20
- distributed training: True, compute dtype: torch.float16
21
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
22
- _n_gpu=1,
23
- adafactor=False,
24
- adam_beta1=0.9,
25
- adam_beta2=0.999,
26
- adam_epsilon=1e-08,
27
- auto_find_batch_size=False,
28
- bf16=False,
29
- bf16_full_eval=False,
30
- data_seed=None,
31
- dataloader_drop_last=False,
32
- dataloader_num_workers=0,
33
- dataloader_persistent_workers=False,
34
- dataloader_pin_memory=True,
35
- ddp_backend=None,
36
- ddp_broadcast_buffers=None,
37
- ddp_bucket_cap_mb=None,
38
- ddp_find_unused_parameters=None,
39
- ddp_timeout=1800,
40
- debug=[],
41
- deepspeed=ds_config.json,
42
- disable_tqdm=False,
43
- dispatch_batches=None,
44
- do_eval=False,
45
- do_predict=False,
46
- do_train=True,
47
- eval_accumulation_steps=None,
48
- eval_delay=0,
49
- eval_steps=None,
50
- evaluation_strategy=no,
51
- fp16=True,
52
- fp16_backend=auto,
53
- fp16_full_eval=False,
54
- fp16_opt_level=O1,
55
- fsdp=[],
56
- fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
57
- fsdp_min_num_params=0,
58
- fsdp_transformer_layer_cls_to_wrap=None,
59
- full_determinism=False,
60
- generation_config=None,
61
- generation_max_length=None,
62
- generation_num_beams=None,
63
- gradient_accumulation_steps=28,
64
- gradient_checkpointing=False,
65
- gradient_checkpointing_kwargs=None,
66
- greater_is_better=None,
67
- group_by_length=False,
68
- half_precision_backend=auto,
69
- hub_always_push=False,
70
- hub_model_id=None,
71
- hub_private_repo=False,
72
- hub_strategy=every_save,
73
- hub_token=<HUB_TOKEN>,
74
- ignore_data_skip=False,
75
- include_inputs_for_metrics=False,
76
- include_num_input_tokens_seen=False,
77
- include_tokens_per_second=False,
78
- jit_mode_eval=False,
79
- label_names=None,
80
- label_smoothing_factor=0.0,
81
- learning_rate=1e-06,
82
- length_column_name=length,
83
- load_best_model_at_end=False,
84
- local_rank=2,
85
- log_level=passive,
86
- log_level_replica=warning,
87
- log_on_each_node=True,
88
- logging_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/runs/Feb01_14-20-22_siat-a100-4-02,
89
- logging_first_step=False,
90
- logging_nan_inf_filter=True,
91
- logging_steps=10,
92
- logging_strategy=steps,
93
- lr_scheduler_kwargs={},
94
- lr_scheduler_type=cosine,
95
- max_grad_norm=1.0,
96
- max_steps=-1,
97
- metric_for_best_model=None,
98
- mp_parameters=,
99
- neftune_noise_alpha=None,
100
- no_cuda=False,
101
- num_train_epochs=9.0,
102
- optim=adamw_torch,
103
- optim_args=None,
104
- output_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
105
- overwrite_output_dir=True,
106
- past_index=-1,
107
- per_device_eval_batch_size=8,
108
- per_device_train_batch_size=4,
109
- predict_with_generate=False,
110
- prediction_loss_only=False,
111
- push_to_hub=False,
112
- push_to_hub_model_id=None,
113
- push_to_hub_organization=None,
114
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
115
- ray_scope=last,
116
- remove_unused_columns=True,
117
- report_to=[],
118
- resume_from_checkpoint=None,
119
- run_name=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
120
- save_on_each_node=False,
121
- save_only_model=False,
122
- save_safetensors=True,
123
- save_steps=21,
124
- save_strategy=steps,
125
- save_total_limit=None,
126
- seed=42,
127
- skip_memory_metrics=True,
128
- sortish_sampler=False,
129
- split_batches=False,
130
- tf32=None,
131
- torch_compile=False,
132
- torch_compile_backend=None,
133
- torch_compile_mode=None,
134
- torchdynamo=None,
135
- tpu_metrics_debug=False,
136
- tpu_num_cores=None,
137
- use_cpu=False,
138
- use_ipex=False,
139
- use_legacy_prediction_loop=False,
140
- use_mps_device=False,
141
- warmup_ratio=0.0,
142
- warmup_steps=0,
143
- weight_decay=0.0,
144
- )
145
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1
146
- distributed training: True, compute dtype: torch.float16
147
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
148
- _n_gpu=1,
149
- adafactor=False,
150
- adam_beta1=0.9,
151
- adam_beta2=0.999,
152
- adam_epsilon=1e-08,
153
- auto_find_batch_size=False,
154
- bf16=False,
155
- bf16_full_eval=False,
156
- data_seed=None,
157
- dataloader_drop_last=False,
158
- dataloader_num_workers=0,
159
- dataloader_persistent_workers=False,
160
- dataloader_pin_memory=True,
161
- ddp_backend=None,
162
- ddp_broadcast_buffers=None,
163
- ddp_bucket_cap_mb=None,
164
- ddp_find_unused_parameters=None,
165
- ddp_timeout=1800,
166
- debug=[],
167
- deepspeed=ds_config.json,
168
- disable_tqdm=False,
169
- dispatch_batches=None,
170
- do_eval=False,
171
- do_predict=False,
172
- do_train=True,
173
- eval_accumulation_steps=None,
174
- eval_delay=0,
175
- eval_steps=None,
176
- evaluation_strategy=no,
177
- fp16=True,
178
- fp16_backend=auto,
179
- fp16_full_eval=False,
180
- fp16_opt_level=O1,
181
- fsdp=[],
182
- fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
183
- fsdp_min_num_params=0,
184
- fsdp_transformer_layer_cls_to_wrap=None,
185
- full_determinism=False,
186
- generation_config=None,
187
- generation_max_length=None,
188
- generation_num_beams=None,
189
- gradient_accumulation_steps=28,
190
- gradient_checkpointing=False,
191
- gradient_checkpointing_kwargs=None,
192
- greater_is_better=None,
193
- group_by_length=False,
194
- half_precision_backend=auto,
195
- hub_always_push=False,
196
- hub_model_id=None,
197
- hub_private_repo=False,
198
- hub_strategy=every_save,
199
- hub_token=<HUB_TOKEN>,
200
- ignore_data_skip=False,
201
- include_inputs_for_metrics=False,
202
- include_num_input_tokens_seen=False,
203
- include_tokens_per_second=False,
204
- jit_mode_eval=False,
205
- label_names=None,
206
- label_smoothing_factor=0.0,
207
- learning_rate=1e-06,
208
- length_column_name=length,
209
- load_best_model_at_end=False,
210
- local_rank=0,
211
- log_level=passive,
212
- log_level_replica=warning,
213
- log_on_each_node=True,
214
- logging_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/runs/Feb01_14-20-23_siat-a100-4-02,
215
- logging_first_step=False,
216
- logging_nan_inf_filter=True,
217
- logging_steps=10,
218
- logging_strategy=steps,
219
- lr_scheduler_kwargs={},
220
- lr_scheduler_type=cosine,
221
- max_grad_norm=1.0,
222
- max_steps=-1,
223
- metric_for_best_model=None,
224
- mp_parameters=,
225
- neftune_noise_alpha=None,
226
- no_cuda=False,
227
- num_train_epochs=9.0,
228
- optim=adamw_torch,
229
- optim_args=None,
230
- output_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
231
- overwrite_output_dir=True,
232
- past_index=-1,
233
- per_device_eval_batch_size=8,
234
- per_device_train_batch_size=4,
235
- predict_with_generate=False,
236
- prediction_loss_only=False,
237
- push_to_hub=False,
238
- push_to_hub_model_id=None,
239
- push_to_hub_organization=None,
240
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
241
- ray_scope=last,
242
- remove_unused_columns=True,
243
- report_to=[],
244
- resume_from_checkpoint=None,
245
- run_name=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
246
- save_on_each_node=False,
247
- save_only_model=False,
248
- save_safetensors=True,
249
- save_steps=21,
250
- save_strategy=steps,
251
- save_total_limit=None,
252
- seed=42,
253
- skip_memory_metrics=True,
254
- sortish_sampler=False,
255
- split_batches=False,
256
- tf32=None,
257
- torch_compile=False,
258
- torch_compile_backend=None,
259
- torch_compile_mode=None,
260
- torchdynamo=None,
261
- tpu_metrics_debug=False,
262
- tpu_num_cores=None,
263
- use_cpu=False,
264
- use_ipex=False,
265
- use_legacy_prediction_loop=False,
266
- use_mps_device=False,
267
- warmup_ratio=0.0,
268
- warmup_steps=0,
269
- weight_decay=0.0,
270
- )
271
- [INFO|tokenization_utils_base.py:2025] 2024-02-01 14:20:24,513 >> loading file ./tokenizer.model
272
- [INFO|tokenization_utils_base.py:2025] 2024-02-01 14:20:24,513 >> loading file added_tokens.json
273
- [INFO|tokenization_utils_base.py:2025] 2024-02-01 14:20:24,513 >> loading file special_tokens_map.json
274
- [INFO|tokenization_utils_base.py:2025] 2024-02-01 14:20:24,513 >> loading file tokenizer_config.json
275
- [INFO|tokenization_utils_base.py:2025] 2024-02-01 14:20:24,513 >> loading file tokenizer.json
276
- [INFO|configuration_utils.py:727] 2024-02-01 14:20:24,850 >> loading configuration file /home/lirenhao/pretrained_models/internlm2-chat-7b/config.json
277
- [INFO|configuration_utils.py:727] 2024-02-01 14:20:24,852 >> loading configuration file /home/lirenhao/pretrained_models/internlm2-chat-7b/config.json
278
- [INFO|configuration_utils.py:792] 2024-02-01 14:20:24,854 >> Model config InternLM2Config {
279
- "_name_or_path": "/home/lirenhao/pretrained_models/internlm2-chat-7b/",
280
- "architectures": [
281
- "InternLM2ForCausalLM"
282
- ],
283
- "attn_implementation": "eager",
284
- "auto_map": {
285
- "AutoConfig": "configuration_internlm2.InternLM2Config",
286
- "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
287
- "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
288
- },
289
- "bias": false,
290
- "bos_token_id": 1,
291
- "eos_token_id": 2,
292
- "hidden_act": "silu",
293
- "hidden_size": 4096,
294
- "initializer_range": 0.02,
295
- "intermediate_size": 14336,
296
- "max_position_embeddings": 32768,
297
- "model_type": "internlm2",
298
- "num_attention_heads": 32,
299
- "num_hidden_layers": 32,
300
- "num_key_value_heads": 8,
301
- "pad_token_id": 2,
302
- "rms_norm_eps": 1e-05,
303
- "rope_scaling": {
304
- "factor": 2.0,
305
- "type": "dynamic"
306
- },
307
- "rope_theta": 1000000,
308
- "tie_word_embeddings": false,
309
- "torch_dtype": "float16",
310
- "transformers_version": "4.37.1",
311
- "use_cache": true,
312
- "vocab_size": 92544
313
- }
314
-
315
- [INFO|modeling_utils.py:3475] 2024-02-01 14:20:24,903 >> loading weights file /home/lirenhao/pretrained_models/internlm2-chat-7b/pytorch_model.bin.index.json
316
- [INFO|modeling_utils.py:1428] 2024-02-01 14:20:24,903 >> Instantiating InternLM2ForCausalLM model under default dtype torch.float16.
317
- [INFO|configuration_utils.py:826] 2024-02-01 14:20:24,905 >> Generate config GenerationConfig {
318
- "bos_token_id": 1,
319
- "eos_token_id": 2,
320
- "pad_token_id": 2
321
- }
322
-
323
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1
324
- distributed training: True, compute dtype: torch.float16
325
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
326
- _n_gpu=1,
327
- adafactor=False,
328
- adam_beta1=0.9,
329
- adam_beta2=0.999,
330
- adam_epsilon=1e-08,
331
- auto_find_batch_size=False,
332
- bf16=False,
333
- bf16_full_eval=False,
334
- data_seed=None,
335
- dataloader_drop_last=False,
336
- dataloader_num_workers=0,
337
- dataloader_persistent_workers=False,
338
- dataloader_pin_memory=True,
339
- ddp_backend=None,
340
- ddp_broadcast_buffers=None,
341
- ddp_bucket_cap_mb=None,
342
- ddp_find_unused_parameters=None,
343
- ddp_timeout=1800,
344
- debug=[],
345
- deepspeed=ds_config.json,
346
- disable_tqdm=False,
347
- dispatch_batches=None,
348
- do_eval=False,
349
- do_predict=False,
350
- do_train=True,
351
- eval_accumulation_steps=None,
352
- eval_delay=0,
353
- eval_steps=None,
354
- evaluation_strategy=no,
355
- fp16=True,
356
- fp16_backend=auto,
357
- fp16_full_eval=False,
358
- fp16_opt_level=O1,
359
- fsdp=[],
360
- fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
361
- fsdp_min_num_params=0,
362
- fsdp_transformer_layer_cls_to_wrap=None,
363
- full_determinism=False,
364
- generation_config=None,
365
- generation_max_length=None,
366
- generation_num_beams=None,
367
- gradient_accumulation_steps=28,
368
- gradient_checkpointing=False,
369
- gradient_checkpointing_kwargs=None,
370
- greater_is_better=None,
371
- group_by_length=False,
372
- half_precision_backend=auto,
373
- hub_always_push=False,
374
- hub_model_id=None,
375
- hub_private_repo=False,
376
- hub_strategy=every_save,
377
- hub_token=<HUB_TOKEN>,
378
- ignore_data_skip=False,
379
- include_inputs_for_metrics=False,
380
- include_num_input_tokens_seen=False,
381
- include_tokens_per_second=False,
382
- jit_mode_eval=False,
383
- label_names=None,
384
- label_smoothing_factor=0.0,
385
- learning_rate=1e-06,
386
- length_column_name=length,
387
- load_best_model_at_end=False,
388
- local_rank=1,
389
- log_level=passive,
390
- log_level_replica=warning,
391
- log_on_each_node=True,
392
- logging_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/runs/Feb01_14-20-23_siat-a100-4-02,
393
- logging_first_step=False,
394
- logging_nan_inf_filter=True,
395
- logging_steps=10,
396
- logging_strategy=steps,
397
- lr_scheduler_kwargs={},
398
- lr_scheduler_type=cosine,
399
- max_grad_norm=1.0,
400
- max_steps=-1,
401
- metric_for_best_model=None,
402
- mp_parameters=,
403
- neftune_noise_alpha=None,
404
- no_cuda=False,
405
- num_train_epochs=9.0,
406
- optim=adamw_torch,
407
- optim_args=None,
408
- output_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
409
- overwrite_output_dir=True,
410
- past_index=-1,
411
- per_device_eval_batch_size=8,
412
- per_device_train_batch_size=4,
413
- predict_with_generate=False,
414
- prediction_loss_only=False,
415
- push_to_hub=False,
416
- push_to_hub_model_id=None,
417
- push_to_hub_organization=None,
418
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
419
- ray_scope=last,
420
- remove_unused_columns=True,
421
- report_to=[],
422
- resume_from_checkpoint=None,
423
- run_name=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
424
- save_on_each_node=False,
425
- save_only_model=False,
426
- save_safetensors=True,
427
- save_steps=21,
428
- save_strategy=steps,
429
- save_total_limit=None,
430
- seed=42,
431
- skip_memory_metrics=True,
432
- sortish_sampler=False,
433
- split_batches=False,
434
- tf32=None,
435
- torch_compile=False,
436
- torch_compile_backend=None,
437
- torch_compile_mode=None,
438
- torchdynamo=None,
439
- tpu_metrics_debug=False,
440
- tpu_num_cores=None,
441
- use_cpu=False,
442
- use_ipex=False,
443
- use_legacy_prediction_loop=False,
444
- use_mps_device=False,
445
- warmup_ratio=0.0,
446
- warmup_steps=0,
447
- weight_decay=0.0,
448
- )
449
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Process rank: 3, device: cuda:3, n_gpu: 1
450
- distributed training: True, compute dtype: torch.float16
451
- 02/01/2024 14:20:24 - INFO - llmtuner.hparams.parser - Training/evaluation parameters Seq2SeqTrainingArguments(
452
- _n_gpu=1,
453
- adafactor=False,
454
- adam_beta1=0.9,
455
- adam_beta2=0.999,
456
- adam_epsilon=1e-08,
457
- auto_find_batch_size=False,
458
- bf16=False,
459
- bf16_full_eval=False,
460
- data_seed=None,
461
- dataloader_drop_last=False,
462
- dataloader_num_workers=0,
463
- dataloader_persistent_workers=False,
464
- dataloader_pin_memory=True,
465
- ddp_backend=None,
466
- ddp_broadcast_buffers=None,
467
- ddp_bucket_cap_mb=None,
468
- ddp_find_unused_parameters=None,
469
- ddp_timeout=1800,
470
- debug=[],
471
- deepspeed=ds_config.json,
472
- disable_tqdm=False,
473
- dispatch_batches=None,
474
- do_eval=False,
475
- do_predict=False,
476
- do_train=True,
477
- eval_accumulation_steps=None,
478
- eval_delay=0,
479
- eval_steps=None,
480
- evaluation_strategy=no,
481
- fp16=True,
482
- fp16_backend=auto,
483
- fp16_full_eval=False,
484
- fp16_opt_level=O1,
485
- fsdp=[],
486
- fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
487
- fsdp_min_num_params=0,
488
- fsdp_transformer_layer_cls_to_wrap=None,
489
- full_determinism=False,
490
- generation_config=None,
491
- generation_max_length=None,
492
- generation_num_beams=None,
493
- gradient_accumulation_steps=28,
494
- gradient_checkpointing=False,
495
- gradient_checkpointing_kwargs=None,
496
- greater_is_better=None,
497
- group_by_length=False,
498
- half_precision_backend=auto,
499
- hub_always_push=False,
500
- hub_model_id=None,
501
- hub_private_repo=False,
502
- hub_strategy=every_save,
503
- hub_token=<HUB_TOKEN>,
504
- ignore_data_skip=False,
505
- include_inputs_for_metrics=False,
506
- include_num_input_tokens_seen=False,
507
- include_tokens_per_second=False,
508
- jit_mode_eval=False,
509
- label_names=None,
510
- label_smoothing_factor=0.0,
511
- learning_rate=1e-06,
512
- length_column_name=length,
513
- load_best_model_at_end=False,
514
- local_rank=3,
515
- log_level=passive,
516
- log_level_replica=warning,
517
- log_on_each_node=True,
518
- logging_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/runs/Feb01_14-20-23_siat-a100-4-02,
519
- logging_first_step=False,
520
- logging_nan_inf_filter=True,
521
- logging_steps=10,
522
- logging_strategy=steps,
523
- lr_scheduler_kwargs={},
524
- lr_scheduler_type=cosine,
525
- max_grad_norm=1.0,
526
- max_steps=-1,
527
- metric_for_best_model=None,
528
- mp_parameters=,
529
- neftune_noise_alpha=None,
530
- no_cuda=False,
531
- num_train_epochs=9.0,
532
- optim=adamw_torch,
533
- optim_args=None,
534
- output_dir=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
535
- overwrite_output_dir=True,
536
- past_index=-1,
537
- per_device_eval_batch_size=8,
538
- per_device_train_batch_size=4,
539
- predict_with_generate=False,
540
- prediction_loss_only=False,
541
- push_to_hub=False,
542
- push_to_hub_model_id=None,
543
- push_to_hub_organization=None,
544
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
545
- ray_scope=last,
546
- remove_unused_columns=True,
547
- report_to=[],
548
- resume_from_checkpoint=None,
549
- run_name=/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9,
550
- save_on_each_node=False,
551
- save_only_model=False,
552
- save_safetensors=True,
553
- save_steps=21,
554
- save_strategy=steps,
555
- save_total_limit=None,
556
- seed=42,
557
- skip_memory_metrics=True,
558
- sortish_sampler=False,
559
- split_batches=False,
560
- tf32=None,
561
- torch_compile=False,
562
- torch_compile_backend=None,
563
- torch_compile_mode=None,
564
- torchdynamo=None,
565
- tpu_metrics_debug=False,
566
- tpu_num_cores=None,
567
- use_cpu=False,
568
- use_ipex=False,
569
- use_legacy_prediction_loop=False,
570
- use_mps_device=False,
571
- warmup_ratio=0.0,
572
- warmup_steps=0,
573
- weight_decay=0.0,
574
- )
575
-
576
- return self.fget.__get__(instance, owner)()
577
-
578
- return self.fget.__get__(instance, owner)()
579
-
580
- return self.fget.__get__(instance, owner)()
581
-
582
- return self.fget.__get__(instance, owner)()
583
-
584
- 02/01/2024 14:20:35 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
585
- 02/01/2024 14:20:35 - INFO - llmtuner.model.adapter - Fine-tuning method: Full
586
-
587
- 02/01/2024 14:20:36 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
588
- 02/01/2024 14:20:36 - INFO - llmtuner.model.adapter - Fine-tuning method: Full
589
-
590
- [INFO|modeling_utils.py:4352] 2024-02-01 14:20:36,242 >> All model checkpoint weights were used when initializing InternLM2ForCausalLM.
591
-
592
- [INFO|modeling_utils.py:4360] 2024-02-01 14:20:36,242 >> All the weights of InternLM2ForCausalLM were initialized from the model checkpoint at /home/lirenhao/pretrained_models/internlm2-chat-7b/.
593
- If your task is similar to the task the model of the checkpoint was trained on, you can already use InternLM2ForCausalLM for predictions without further training.
594
- [INFO|configuration_utils.py:779] 2024-02-01 14:20:36,247 >> loading configuration file /home/lirenhao/pretrained_models/internlm2-chat-7b/generation_config.json
595
- [INFO|configuration_utils.py:826] 2024-02-01 14:20:36,248 >> Generate config GenerationConfig {
596
- "bos_token_id": 1,
597
- "eos_token_id": 2,
598
- "pad_token_id": 2
599
- }
600
-
601
- 02/01/2024 14:20:36 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
602
- 02/01/2024 14:20:36 - INFO - llmtuner.model.adapter - Fine-tuning method: Full
603
-
604
- 02/01/2024 14:20:36 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
605
- 02/01/2024 14:20:36 - INFO - llmtuner.model.adapter - Fine-tuning method: Full
606
- 02/01/2024 14:20:47 - INFO - llmtuner.model.loader - trainable params: 7737708544 || all params: 7737708544 || trainable%: 100.0000
607
- 02/01/2024 14:20:48 - INFO - llmtuner.data.template - Add <|im_end|> to stop words.
608
- 02/01/2024 14:20:48 - INFO - llmtuner.model.loader - trainable params: 7737708544 || all params: 7737708544 || trainable%: 100.0000
609
- 02/01/2024 14:20:48 - INFO - llmtuner.model.loader - trainable params: 7737708544 || all params: 7737708544 || trainable%: 100.0000
610
- 02/01/2024 14:20:49 - INFO - llmtuner.data.template - Add <|im_end|> to stop words.
611
- 02/01/2024 14:20:49 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
612
- 02/01/2024 14:20:49 - INFO - llmtuner.data.template - Add <|im_end|> to stop words.
613
- 02/01/2024 14:20:49 - INFO - llmtuner.model.loader - trainable params: 7737708544 || all params: 7737708544 || trainable%: 100.0000
614
- 02/01/2024 14:20:49 - INFO - llmtuner.data.template - Add <|im_end|> to stop words.
615
- Using custom data configuration default-7bf826ddf73c2f44
616
- Loading Dataset Infos from /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/datasets/packaged_modules/json
617
- Overwrite dataset info from restored data version if exists.
618
- Loading Dataset info from /home/lirenhao/.cache/huggingface/datasets/json/default-7bf826ddf73c2f44/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
619
- Found cached dataset json (/home/lirenhao/.cache/huggingface/datasets/json/default-7bf826ddf73c2f44/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
620
- Loading Dataset info from /home/lirenhao/.cache/huggingface/datasets/json/default-7bf826ddf73c2f44/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
621
-
622
-
623
- 02/01/2024 14:20:53 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
624
- 02/01/2024 14:20:53 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
625
- 02/01/2024 14:20:53 - WARNING - llmtuner.data.utils - Checksum failed: missing SHA-1 hash value in dataset_info.json.
626
-
627
-
628
-
629
- Caching processed dataset at /home/lirenhao/.cache/huggingface/datasets/json/default-7bf826ddf73c2f44/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-7cecb244118aac13.arrow
630
-
631
- input_ids:
632
- [1, 92543, 9081, 364, 2770, 657, 589, 15358, 17993, 6843, 963, 505, 4576, 11146, 451, 60628, 60384, 60721, 62442, 60752, 4452, 285, 4576, 11146, 451, 60628, 60384, 60721, 62442, 60752, 313, 505, 395, 7659, 1813, 4287, 1762, 560, 505, 8020, 684, 36956, 15358, 31288, 451, 68589, 76659, 71581, 699, 1226, 505, 6342, 442, 517, 11100, 328, 10894, 328, 454, 51978, 756, 285, 4576, 11146, 451, 60628, 60384, 60721, 62442, 60752, 313, 777, 3696, 454, 19187, 19829, 4563, 435, 410, 4287, 12032, 684, 410, 1341, 1893, 569, 6519, 454, 262, 69093, 281, 92542, 364, 92543, 1008, 364, 85064, 60703, 60353, 68856, 68306, 61860, 62703, 69516, 68765, 68984, 60362, 60353, 60376, 68678, 60427, 69944, 60355, 92542, 364, 92543, 525, 11353, 364, 73406, 68865, 68364, 69377, 60353, 86839, 70004, 68364, 69516, 69461, 71677, 68287, 60353, 69029, 68831, 68287, 60355, 68931, 69702, 75326, 71838, 60403, 61860, 62703, 77797, 68540, 60355, 364, 92543, 1008, 364, 74820, 68399, 69088, 60677, 68540, 60353, 61032, 71155, 69059, 60355, 92542, 364, 92543, 525, 11353, 364, 91781, 60353, 73161, 80540, 60415, 82098, 60355, 72010, 71404, 60353, 60403, 76153, 68912, 60381, 74112, 61076, 60504, 364, 92543, 1008, 364, 68856, 68306, 68912, 68326, 75848, 68595, 60353, 69972, 71645, 68473, 68585, 60353, 60404, 68965, 61716, 60418, 68273, 60353, 70124, 70698, 60363, 60355, 68389, 60363, 69667, 68306, 68303, 60353, 61214, 68310, 68758, 68261, 70623, 60355, 92542, 364, 92543, 525, 11353, 364, 72010, 69030, 71711, 61076, 60504, 68522, 60353, 86004, 71645, 68629, 68804, 68592, 69095, 60504, 364, 92543, 1008, 364, 84386, 68268, 68315, 75835, 79506, 60353, 60404, 68965, 72245, 68306, 69377, 60355, 60363, 69836, 60427, 70681, 60353, 69281, 91457, 71102, 62600, 62792, 60425, 60355, 92542, 364, 92543, 525, 11353, 364, 82967, 69068, 87160, 68261, 60504, 68319, 60353, 60403, 70868, 69962, 60871, 69893, 60366, 73603, 68261, 60504, 364, 92543, 1008, 364, 68678, 70219, 92396, 84863, 73603, 68252, 71869, 76758, 60353, 68252, 78650, 68306, 71645, 60355, 60363, 70802, 68626, 71010, 73382, 69893, 60353, 60499, 69361, 61032, 68678, 72415, 60355, 92542, 364, 92543, 525, 11353, 364, 72010, 82120, 68381, 72415, 70860, 69209, 61076, 60504, 68522, 60353, 73880, 60359, 75493, 60359, 72415, 60504, 364, 92543, 1008, 364, 74212, 60353, 60363, 73408, 69836, 73880, 60381, 72415, 60355, 69097, 60353, 88720, 60382, 71343, 68254, 70861, 68892, 60459, 71356, 60586, 60355, 92542, 364, 92543, 525, 11353, 364, 68374, 69209, 73175, 68364, 77514, 61076, 60504, 68522, 60353, 68364, 70033, 60359, 69441, 60359, 68273, 60504, 364, 92543, 1008, 364, 88554, 60355, 60363, 68848, 70033, 81269, 60353, 68965, 60520, 80959, 60355, 68389, 60363, 79837, 80665, 60353, 70465, 70802, 70133, 60355, 92542, 364, 92543, 525, 11353, 364, 76273, 68374, 60353, 69060, 71958, 60353, 68364, 69516, 70848, 69715, 60354, 60355, 81425, 68831, 68637, 60353, 80698, 74131, 73382, 79309, 60355, 364, 92543, 1008, 364, 68369, 61076, 60504, 60462, 69735, 91900, 60827, 60504, 92542, 364, 92543, 525, 11353, 364, 68400, 60353, 69897, 68505, 68364, 69209, 60353, 68908, 69116, 60381, 82567, 69290, 60355, 68265, 60353, 73161, 72826, 68288, 69418, 68304, 68747, 60353, 69068, 60381, 71645, 60359, 68303, 82409, 68615, 69715, 60355, 364, 92543, 1008, 364, 82700, 69460, 70417, 60355, 86492, 60577, 69353, 68301, 60827, 60504, 92542, 364, 92543, 525, 11353, 364, 60577, 68505, 68364, 69209, 68301, 60355, 75630, 82261, 68300, 60353, 68848, 68427, 69836, 73880, 68319, 72415, 68269, 60353, 69095, 68540, 74465, 60504, 2]
633
- inputs:
634
- <s> <|im_start|> system
635
- You are an AI assistant whose name is InternLM (书生·浦语).
636
- - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
637
- - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.<|im_end|>
638
- <|im_start|> user
639
- 心理咨询师,我觉得我的胸闷症状越来越严重了,这让我很害怕。<|im_end|>
640
- <|im_start|> assistant
641
- 我能理解你的感受,首先我们要明确你的症状并不是生理问题,而是心理问题。我们可以尝试找出引发你胸闷的心理原因。
642
- <|im_start|> user
643
- 可是我一直都在找原因,却找不到答案。<|im_end|>
644
- <|im_start|> assistant
645
- 不要着急,我们会一步一步地解决这个问题。你能告诉我,你生活中的压力和困扰吗?
646
- <|im_start|> user
647
- 我觉得我的压力主要来自于家庭,我和丈夫关系不好,他总是忙于工作,很少关心我。而且我担心我的孩子,怕他们出了什么意外。<|im_end|>
648
- <|im_start|> assistant
649
- 你能详细说说吗?比如,你和丈夫之间的问题具体是什么?
650
- <|im_start|> user
651
- 我们经常因为一些小事争吵,他总是忽略我的感受。我感到很孤独,就像被困在一个牢笼里。<|im_end|>
652
- <|im_start|> assistant
653
- 这种感觉让你想起了什么?或者,你觉得自己在这段婚姻中失去了什么?
654
- <|im_start|> user
655
- 让我想想……我觉得我失去了一个温馨的家,一个关心我的丈夫。我一直在努力维持这段婚姻,但现实却让我失望。<|im_end|>
656
- <|im_start|> assistant
657
- 你能体会到这种失望带来的情绪吗?比如,伤心、愤怒、失望?
658
- <|im_start|> user
659
- 是的,我经常会感到伤心和失望。有时候,我甚至会怀疑自己的人生是不是选错了路。<|im_end|>
660
- <|im_start|> assistant
661
- 这些情绪会影响你的日常生活吗?比如,你的睡眠、饮食、工作?
662
- <|im_start|> user
663
- 肯定的。我最近睡眠很差,总是做噩梦。而且我吃得也不好,体重一直在下降。<|im_end|>
664
- <|im_start|> assistant
665
- 了解到这些,我想告诉你,你的症状是可以改善的。我们可以通过心理治疗,帮助你走出这段困境。
666
- <|im_start|> user
667
- 真的吗?那我要如何做呢?<|im_end|>
668
- <|im_start|> assistant
669
- 首先,我们要了解你的情绪,学会面对和接纳它们。然后,我们会教你怎么表达自己的需求,让你和丈夫、孩子之间的关系得到改善。
670
- <|im_start|> user
671
- 听起来很有道理。那我们从哪里开始呢?<|im_end|>
672
- <|im_start|> assistant
673
- 从了解你的情绪开始。试着回想一下,最近一次感到伤心或者失望的时候,是什么原因导致的?</s>
674
- label_ids:
675
- [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 73406, 68865, 68364, 69377, 60353, 86839, 70004, 68364, 69516, 69461, 71677, 68287, 60353, 69029, 68831, 68287, 60355, 68931, 69702, 75326, 71838, 60403, 61860, 62703, 77797, 68540, 60355, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 91781, 60353, 73161, 80540, 60415, 82098, 60355, 72010, 71404, 60353, 60403, 76153, 68912, 60381, 74112, 61076, 60504, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 72010, 69030, 71711, 61076, 60504, 68522, 60353, 86004, 71645, 68629, 68804, 68592, 69095, 60504, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 82967, 69068, 87160, 68261, 60504, 68319, 60353, 60403, 70868, 69962, 60871, 69893, 60366, 73603, 68261, 60504, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 72010, 82120, 68381, 72415, 70860, 69209, 61076, 60504, 68522, 60353, 73880, 60359, 75493, 60359, 72415, 60504, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 68374, 69209, 73175, 68364, 77514, 61076, 60504, 68522, 60353, 68364, 70033, 60359, 69441, 60359, 68273, 60504, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 76273, 68374, 60353, 69060, 71958, 60353, 68364, 69516, 70848, 69715, 60354, 60355, 81425, 68831, 68637, 60353, 80698, 74131, 73382, 79309, 60355, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 68400, 60353, 69897, 68505, 68364, 69209, 60353, 68908, 69116, 60381, 82567, 69290, 60355, 68265, 60353, 73161, 72826, 68288, 69418, 68304, 68747, 60353, 69068, 60381, 71645, 60359, 68303, 82409, 68615, 69715, 60355, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 60577, 68505, 68364, 69209, 68301, 60355, 75630, 82261, 68300, 60353, 68848, 68427, 69836, 73880, 68319, 72415, 68269, 60353, 69095, 68540, 74465, 60504, 2]
676
- labels:
677
- 我能理解你的感受,首先我们要明确你的症状并不是生理问题,而是心理问题。我们可以尝试找出引发你胸闷的心理原因。</s> 不要着急,我们会一步一步地解决这个问题。你能告诉我,你生活中的压力和困扰吗?</s> 你能详细说说吗?比如,你和丈夫之间的问题具体是什么?</s> 这种感觉让你想起了什么?或者,你觉得自己在这段婚姻中失去了什么?</s> 你能体会到这种失望带来的情绪吗?比如,伤心、愤怒、失望?</s> 这些情绪会影响你的日常生活吗?比如,你的睡眠、饮食、工作?</s> 了解到这些,我想告诉你,你的症状是可以改善的。我们可以通过心理治疗,帮助你走出这段困境。</s> 首先,我们要了解你的情绪,学会面对和接纳它们。然后,我们会教你怎么表达自己的需求,让你和丈夫、孩子之间的关系得到改善。</s> 从了解你的情绪开始。试着回想一下,最近一次感到伤心或者失望的时候,是什么原因导致的?</s>
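
Note: the dump above shows the intern2 chat template (ChatML-style <|im_start|>/<|im_end|> markers) and how label_ids keep only assistant tokens, masking everything else with -100. A minimal sketch of that masking, using hypothetical render_turn/build_example helpers rather than llmtuner's real code:

    IGNORE_INDEX = -100  # loss is skipped wherever label_ids holds this value

    def render_turn(role: str, text: str) -> str:
        return f"<|im_start|>{role}\n{text}<|im_end|>\n"

    def build_example(tokenizer, dialog):
        input_ids, labels = [tokenizer.bos_token_id], [IGNORE_INDEX]
        for role, text in dialog:
            ids = tokenizer.encode(render_turn(role, text), add_special_tokens=False)
            input_ids += ids
            # only assistant replies are supervised; system/user turns stay masked
            labels += ids if role == "assistant" else [IGNORE_INDEX] * len(ids)
        return input_ids, labels
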
678
- [INFO|training_args.py:1828] 2024-02-01 14:21:08,098 >> PyTorch: setting up devices
679
-
680
- warnings.warn(
681
- Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
682
- [INFO|trainer.py:571] 2024-02-01 14:21:08,153 >> Using auto half precision backend
683
- [2024-02-01 14:21:08,351] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.13.1, git-hash=unknown, git-branch=unknown
684
-
685
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/transformers/training_args.py:1741: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
686
- warnings.warn(
687
-
688
-
689
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/transformers/training_args.py:1741: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
690
- warnings.warn(
691
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/transformers/training_args.py:1741: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.
692
- warnings.warn(
693
- [2024-02-01 14:21:41,776] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
694
- [2024-02-01 14:21:41,778] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
695
- [2024-02-01 14:21:41,778] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
696
- [2024-02-01 14:21:41,794] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW
697
- [2024-02-01 14:21:41,794] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=<class 'torch.optim.adamw.AdamW'>
698
- [2024-02-01 14:21:41,794] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 2 optimizer
699
- [2024-02-01 14:21:41,795] [INFO] [stage_1_and_2.py:143:__init__] Reduce bucket size 500000000
700
- [2024-02-01 14:21:41,795] [INFO] [stage_1_and_2.py:144:__init__] Allgather bucket size 500000000
701
- [2024-02-01 14:21:41,795] [INFO] [stage_1_and_2.py:145:__init__] CPU Offload: False
702
- [2024-02-01 14:21:41,795] [INFO] [stage_1_and_2.py:146:__init__] Round robin gradient partitioning: False
703
- [2024-02-01 14:22:01,253] [INFO] [utils.py:791:see_memory_usage] Before initializing optimizer states
704
- [2024-02-01 14:22:01,254] [INFO] [utils.py:792:see_memory_usage] MA 22.12 GB Max_MA 25.72 GB CA 25.85 GB Max_CA 26 GB
705
- [2024-02-01 14:22:01,254] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 119.45 GB, percent = 12.4%
706
- [2024-02-01 14:22:01,614] [INFO] [utils.py:791:see_memory_usage] After initializing optimizer states
707
- [2024-02-01 14:22:01,615] [INFO] [utils.py:792:see_memory_usage] MA 36.53 GB Max_MA 50.95 GB CA 54.68 GB Max_CA 55 GB
708
- [2024-02-01 14:22:01,615] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 109.91 GB, percent = 11.4%
709
- [2024-02-01 14:22:01,615] [INFO] [stage_1_and_2.py:533:__init__] optimizer state initialized
710
- [2024-02-01 14:22:01,876] [INFO] [utils.py:791:see_memory_usage] After initializing ZeRO optimizer
711
- [2024-02-01 14:22:01,877] [INFO] [utils.py:792:see_memory_usage] MA 36.53 GB Max_MA 36.53 GB CA 54.68 GB Max_CA 55 GB
712
- [2024-02-01 14:22:01,878] [INFO] [utils.py:799:see_memory_usage] CPU Virtual Memory: used = 101.72 GB, percent = 10.5%
713
- [2024-02-01 14:22:01,881] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW
714
- [2024-02-01 14:22:01,881] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
715
- [2024-02-01 14:22:01,881] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
716
- [2024-02-01 14:22:01,881] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-06], mom=[(0.9, 0.999)]
717
- [2024-02-01 14:22:01,883] [INFO] [config.py:984:print] DeepSpeedEngine configuration:
718
- [2024-02-01 14:22:01,883] [INFO] [config.py:988:print] activation_checkpointing_config {
719
- "partition_activations": false,
720
- "contiguous_memory_optimization": false,
721
- "cpu_checkpointing": false,
722
- "number_checkpoints": null,
723
- "synchronize_checkpoint_boundary": false,
724
- "profile": false
725
- }
726
- [2024-02-01 14:22:01,883] [INFO] [config.py:988:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
727
- [2024-02-01 14:22:01,883] [INFO] [config.py:988:print] amp_enabled .................. False
728
- [2024-02-01 14:22:01,883] [INFO] [config.py:988:print] amp_params ................... False
729
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] autotuning_config ............ {
730
- "enabled": false,
731
- "start_step": null,
732
- "end_step": null,
733
- "metric_path": null,
734
- "arg_mappings": null,
735
- "metric": "throughput",
736
- "model_info": null,
737
- "results_dir": "autotuning_results",
738
- "exps_dir": "autotuning_exps",
739
- "overwrite": true,
740
- "fast": true,
741
- "start_profile_step": 3,
742
- "end_profile_step": 5,
743
- "tuner_type": "gridsearch",
744
- "tuner_early_stopping": 5,
745
- "tuner_num_trials": 50,
746
- "model_info_path": null,
747
- "mp_size": 1,
748
- "max_train_batch_size": null,
749
- "min_train_batch_size": 1,
750
- "max_train_micro_batch_size_per_gpu": 1.024000e+03,
751
- "min_train_micro_batch_size_per_gpu": 1,
752
- "num_tuning_micro_batch_sizes": 3
753
- }
754
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] bfloat16_enabled ............. False
755
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] checkpoint_parallel_write_pipeline False
756
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] checkpoint_tag_validation_enabled True
757
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] checkpoint_tag_validation_fail False
758
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f7f6152d840>
759
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] communication_data_type ...... None
760
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
761
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] curriculum_enabled_legacy .... False
762
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] curriculum_params_legacy ..... False
763
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
764
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] data_efficiency_enabled ...... False
765
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] dataloader_drop_last ......... False
766
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] disable_allgather ............ False
767
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] dump_state ................... False
768
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1}
769
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_enabled ........... False
770
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_gas_boundary_resolution 1
771
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_layer_name ........ bert.encoder.layer
772
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_layer_num ......... 0
773
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_max_iter .......... 100
774
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_stability ......... 1e-06
775
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_tol ............... 0.01
776
- [2024-02-01 14:22:01,884] [INFO] [config.py:988:print] eigenvalue_verbose ........... False
777
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] elasticity_enabled ........... False
778
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] flops_profiler_config ........ {
779
- "enabled": false,
780
- "recompute_fwd_factor": 0.0,
781
- "profile_step": 1,
782
- "module_depth": -1,
783
- "top_modules": 1,
784
- "detailed": true,
785
- "output_file": null
786
- }
787
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] fp16_auto_cast ............... False
788
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] fp16_enabled ................. True
789
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] fp16_master_weights_and_gradients False
790
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] global_rank .................. 0
791
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] grad_accum_dtype ............. None
792
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] gradient_accumulation_steps .. 28
793
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] gradient_clipping ............ 1.0
794
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] gradient_predivide_factor .... 1.0
795
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] graph_harvesting ............. False
796
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
797
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] initial_dynamic_scale ........ 65536
798
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] load_universal_checkpoint .... False
799
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] loss_scale ................... 0
800
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] memory_breakdown ............. False
801
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] mics_hierarchial_params_gather False
802
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] mics_shard_size .............. -1
803
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
804
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] nebula_config ................ {
805
- "enabled": false,
806
- "persistent_storage_path": null,
807
- "persistent_time_interval": 100,
808
- "num_of_version_in_retention": 2,
809
- "enable_nebula_load": true,
810
- "load_path": null
811
- }
812
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] optimizer_legacy_fusion ...... False
813
- [2024-02-01 14:22:01,885] [INFO] [config.py:988:print] optimizer_name ............... None
814
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] optimizer_params ............. None
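
Note: the InternLM2Config above fully determines the parameter count the log reports later ("trainable params: 7737708544"). A back-of-the-envelope check (my own arithmetic, not log output):

    h, inter, L, V = 4096, 14336, 32, 92544
    n_heads, n_kv, head_dim = 32, 8, 4096 // 32            # grouped-query attention
    attn = h * (n_heads + 2 * n_kv) * head_dim + h * h     # packed wqkv + wo, bias=False
    mlp = 3 * h * inter                                    # gate, up and down projections
    norms = 2 * h                                          # two RMSNorms per layer
    total = 2 * V * h + L * (attn + mlp + norms) + h       # embeddings + untied lm head + final norm
    assert total == 7_737_708_544
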
815
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
816
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] pld_enabled .................. False
817
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] pld_params ................... False
818
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] prescale_gradients ........... False
819
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] scheduler_name ............... None
820
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] scheduler_params ............. None
821
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] seq_parallel_communication_data_type torch.float32
822
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] sparse_attention ............. None
823
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] sparse_gradients_enabled ..... False
824
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] steps_per_print .............. inf
825
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] train_batch_size ............. 448
826
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] train_micro_batch_size_per_gpu 4
827
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] use_data_before_expert_parallel_ False
828
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] use_node_local_storage ....... False
829
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] wall_clock_breakdown ......... False
830
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] weight_quantization_config ... None
831
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] world_size ................... 4
832
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] zero_allow_untested_optimizer True
833
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
834
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] zero_enabled ................. True
835
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] zero_force_ds_cpu_optimizer .. True
836
- [2024-02-01 14:22:01,886] [INFO] [config.py:988:print] zero_optimization_stage ...... 2
837
- [2024-02-01 14:22:01,887] [INFO] [config.py:974:print_user_config] json = {
838
- "train_batch_size": 448,
839
- "train_micro_batch_size_per_gpu": 4,
840
- "gradient_accumulation_steps": 28,
841
- "gradient_clipping": 1.0,
842
- "zero_allow_untested_optimizer": true,
843
- "fp16": {
844
- "enabled": true,
845
- "loss_scale": 0,
846
- "initial_scale_power": 16,
847
- "loss_scale_window": 1000,
848
- "hysteresis": 2,
849
- "min_loss_scale": 1
850
- },
851
- "zero_optimization": {
852
- "stage": 2,
853
- "allgather_partitions": true,
854
- "allgather_bucket_size": 5.000000e+08,
855
- "reduce_scatter": true,
856
- "reduce_bucket_size": 5.000000e+08,
857
- "overlap_comm": false,
858
- "contiguous_gradients": true
859
- },
860
- "steps_per_print": inf,
861
- "bf16": {
862
- "enabled": false
863
- }
864
- }
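
Note: a minimal sketch of consuming this exact JSON with the plain DeepSpeed API (LLaMA-Factory itself lets the HF Trainer do this wiring via --deepspeed ds_config.json); the toy model is a stand-in, and the script must run under the launcher, e.g. deepspeed --num_gpus 4 train.py:

    import deepspeed
    import torch
    model = torch.nn.Linear(8, 8)  # stand-in for InternLM2ForCausalLM
    engine, optimizer, _, _ = deepspeed.initialize(
        model=model, model_parameters=model.parameters(), config="ds_config.json")
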
865
- [INFO|trainer.py:1721] 2024-02-01 14:22:01,887 >> ***** Running training *****
866
- [INFO|trainer.py:1722] 2024-02-01 14:22:01,887 >> Num examples = 3,134
867
- [INFO|trainer.py:1723] 2024-02-01 14:22:01,887 >> Num Epochs = 9
868
- [INFO|trainer.py:1724] 2024-02-01 14:22:01,887 >> Instantaneous batch size per device = 4
869
- [INFO|trainer.py:1727] 2024-02-01 14:22:01,887 >> Total train batch size (w. parallel, distributed & accumulation) = 448
870
- [INFO|trainer.py:1728] 2024-02-01 14:22:01,887 >> Gradient Accumulation steps = 28
871
- [INFO|trainer.py:1729] 2024-02-01 14:22:01,887 >> Total optimization steps = 63
872
- [INFO|trainer.py:1730] 2024-02-01 14:22:01,889 >> Number of trainable parameters = 7,737,708,544
873
-
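
Note: the run summary above is internally consistent; checking the arithmetic (mine, not logged output):

    import math
    micro, gpus, accum = 4, 4, 28
    train_batch = micro * gpus * accum                 # 4 * 4 * 28 = 448
    steps_per_epoch = math.ceil(3134 / train_batch)    # 3,134 examples -> 7 steps
    assert train_batch == 448 and 9 * steps_per_epoch == 63  # 9 epochs -> 63 steps
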
874
-   0%| | 0/63 [00:00<?, ?it/s]/home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
875
- warnings.warn(
876
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
877
- warnings.warn(
878
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
879
- warnings.warn(
880
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
881
- warnings.warn(
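
Note: the repeated use_reentrant warnings come from gradient checkpointing being enabled without an explicit choice. One way to opt in explicitly (assuming the transformers >= 4.35 API, which the logged 4.37.1 has) would be:

    from transformers import AutoModelForCausalLM
    model = AutoModelForCausalLM.from_pretrained(
        "/home/lirenhao/pretrained_models/internlm2-chat-7b/", trust_remote_code=True)
    model.gradient_checkpointing_enable(
        gradient_checkpointing_kwargs={"use_reentrant": False})
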
882
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1968: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
883
- overflow_gpu = get_accelerator().ByteTensor([overflow])
884
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1968: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
885
- overflow_gpu = get_accelerator().ByteTensor([overflow])
886
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1968: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
887
- overflow_gpu = get_accelerator().ByteTensor([overflow])
888
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:1968: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)
889
- overflow_gpu = get_accelerator().ByteTensor([overflow])
890
-
891
-   2%|▏ | 1/63 [00:44<45:32, 44.06s/it]
892
-   3%|▎ | 2/63 [01:23<42:01, 41.33s/it]
893
-   5%|▍ | 3/63 [02:04<41:04, 41.08s/it]
894
-   6%|▋ | 4/63 [02:43<39:37, 40.30s/it]
895
-   8%|▊ | 5/63 [03:22<38:23, 39.72s/it][2024-02-01 14:26:04,941] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1
896
- 
897
-  10%|▉ | 6/63 [04:03<38:08, 40.15s/it][2024-02-01 14:26:44,502] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768
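
Note: the two OVERFLOW lines above show fp16 dynamic loss scaling at work: the first overflow consumes one unit of hysteresis (2 -> 1) without touching the scale, the second halves the scale (65536 -> 32768). A simplified sketch of that update rule (DeepSpeed's real loss scaler also re-grows the scale after a stable window):

    class DynamicLossScaler:
        def __init__(self, init_scale=65536, hysteresis=2):
            self.cur_scale, self.hysteresis = init_scale, hysteresis

        def update_scale(self, overflow: bool):
            if overflow:                      # the optimizer step is skipped either way
                if self.hysteresis > 1:
                    self.hysteresis -= 1      # tolerate this overflow, keep the scale
                else:
                    self.cur_scale = max(self.cur_scale / 2, 1)  # halve the scale
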
898
-
899
  11%|█ | 7/63 [04:42<37:17, 39.96s/it]
900
  13%|█▎ | 8/63 [05:21<36:23, 39.71s/it]
901
  14%|█▍ | 9/63 [06:01<35:39, 39.62s/it]
902
  16%|█▌ | 10/63 [06:41<35:16, 39.93s/it]
903
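The two OVERFLOW lines above are DeepSpeed's dynamic fp16 loss scaler: on an overflow it skips the optimizer step, and once the hysteresis budget is used up it halves the scale. A minimal sketch of that behavior (an illustration under those assumptions, not DeepSpeed's actual loss_scaler.py):

    class DynamicLossScaler:
        """Toy dynamic loss scaler reproducing the two log messages above."""

        def __init__(self, init_scale=65536, hysteresis=2, scale_factor=2):
            self.scale = init_scale
            self.hysteresis = hysteresis      # overflows tolerated before shrinking the scale
            self.cur_hysteresis = hysteresis
            self.scale_factor = scale_factor

        def update(self, overflow):
            if overflow:
                if self.cur_hysteresis > 1:
                    # step 5: "hysteresis is 2. Reducing hysteresis to 1"
                    self.cur_hysteresis -= 1
                else:
                    # step 6: "Attempted loss scale: 65536, reducing to 32768"
                    self.scale /= self.scale_factor
                return False                  # gradients overflowed: skip this step
            self.cur_hysteresis = self.hysteresis
            return True

    scaler = DynamicLossScaler()
    assert scaler.update(True) is False and scaler.scale == 65536  # hysteresis absorbs the 1st overflow
    assert scaler.update(True) is False and scaler.scale == 32768  # 2nd overflow halves the scale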
 17%|█▋ | 11/63 [07:20<34:22, 39.65s/it]
 19%|█▉ | 12/63 [08:00<33:47, 39.76s/it]
 21%|██ | 13/63 [08:39<32:56, 39.53s/it]
 22%|██▏ | 14/63 [09:20<32:32, 39.85s/it]
 24%|██▍ | 15/63 [09:59<31:45, 39.69s/it]
 25%|██▌ | 16/63 [10:38<30:47, 39.31s/it]
 27%|██▋ | 17/63 [11:19<30:31, 39.82s/it]
 29%|██▊ | 18/63 [11:58<29:51, 39.81s/it]
 30%|███ | 19/63 [12:39<29:15, 39.89s/it]
 32%|███▏ | 20/63 [13:19<28:42, 40.06s/it]
 33%|███▎ | 21/63 [13:59<27:59, 39.99s/it][INFO|trainer.py:2926] 2024-02-01 14:36:12,897 >> Saving model checkpoint to /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21
- [INFO|configuration_utils.py:473] 2024-02-01 14:36:12,902 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/config.json
- [INFO|configuration_utils.py:594] 2024-02-01 14:36:12,903 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/generation_config.json
- [INFO|modeling_utils.py:2503] 2024-02-01 14:36:40,422 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/model.safetensors.index.json.
- [INFO|tokenization_utils_base.py:2433] 2024-02-01 14:36:40,424 >> tokenizer config file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/tokenizer_config.json
- [INFO|tokenization_utils_base.py:2442] 2024-02-01 14:36:40,424 >> Special tokens file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/special_tokens_map.json
- [2024-02-01 14:36:41,670] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step21 is about to be saved!
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
- warnings.warn(
- [2024-02-01 14:36:41,683] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/mp_rank_00_model_states.pt
- [2024-02-01 14:36:41,684] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/mp_rank_00_model_states.pt...
- [2024-02-01 14:37:17,058] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/mp_rank_00_model_states.pt.
- [2024-02-01 14:37:17,061] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/zero_pp_rank_0_mp_rank_00_optim_states.pt...
- [2024-02-01 14:38:15,362] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/zero_pp_rank_0_mp_rank_00_optim_states.pt.
- [2024-02-01 14:38:15,363] [INFO] [engine.py:3477:_save_zero_checkpoint] zero checkpoint saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21/global_step21/zero_pp_rank_0_mp_rank_00_optim_states.pt
- [2024-02-01 14:38:15,363] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step21 is ready now!
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
- warnings.warn(
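Each tmp-checkpoint-N directory above holds both sharded safetensors weights and a DeepSpeed global_stepN folder with the ZeRO model/optimizer states. A hedged sketch of consolidating those ZeRO shards into a single fp32 state dict, assuming DeepSpeed's documented zero_to_fp32 helper (verify the import against your installed DeepSpeed version; the output filename is a hypothetical choice):

    import torch
    from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

    ckpt_dir = "/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-21"
    # Reads global_step21/* and merges the ZeRO partitions into one fp32 state dict on CPU.
    state_dict = get_fp32_state_dict_from_zero_checkpoint(ckpt_dir)
    torch.save(state_dict, "consolidated_fp32.bin")  # hypothetical output name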
 35%|███▍ | 22/63 [16:52<54:35, 79.88s/it]
 37%|███▋ | 23/63 [17:32<45:23, 68.08s/it]
 38%|███▊ | 24/63 [18:15<39:17, 60.44s/it]
 40%|███▉ | 25/63 [18:54<34:12, 54.02s/it]
 41%|████▏ | 26/63 [19:33<30:33, 49.54s/it]
 43%|████▎ | 27/63 [20:12<27:49, 46.38s/it]
 44%|████▍ | 28/63 [20:51<25:45, 44.17s/it]
 46%|████▌ | 29/63 [21:31<24:19, 42.92s/it]
 48%|████▊ | 30/63 [22:11<23:07, 42.06s/it]
 49%|████▉ | 31/63 [22:52<22:17, 41.80s/it]
 51%|█████ | 32/63 [23:32<21:11, 41.02s/it]
 52%|█████▏ | 33/63 [24:10<20:06, 40.20s/it]
 54%|█████▍ | 34/63 [24:49<19:18, 39.96s/it]
 56%|█████▌ | 35/63 [25:30<18:43, 40.11s/it]
 57%|█████▋ | 36/63 [26:10<18:03, 40.13s/it]
 59%|█████▊ | 37/63 [26:49<17:12, 39.70s/it]
 60%|██████ | 38/63 [27:29<16:36, 39.88s/it]
 62%|██████▏ | 39/63 [28:08<15:48, 39.51s/it]
 63%|██████▎ | 40/63 [28:46<15:04, 39.34s/it]
 65%|██████▌ | 41/63 [29:27<14:36, 39.84s/it]
 67%|██████▋ | 42/63 [30:07<13:57, 39.87s/it][INFO|trainer.py:2926] 2024-02-01 14:52:21,426 >> Saving model checkpoint to /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42
- [INFO|configuration_utils.py:473] 2024-02-01 14:52:21,431 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/config.json
- [INFO|configuration_utils.py:594] 2024-02-01 14:52:21,432 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/generation_config.json
- [INFO|modeling_utils.py:2503] 2024-02-01 14:52:48,702 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/model.safetensors.index.json.
- [INFO|tokenization_utils_base.py:2433] 2024-02-01 14:52:48,704 >> tokenizer config file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/tokenizer_config.json
- [INFO|tokenization_utils_base.py:2442] 2024-02-01 14:52:48,704 >> Special tokens file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/special_tokens_map.json
- [2024-02-01 14:52:49,843] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step42 is about to be saved!
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
- warnings.warn(
- [2024-02-01 14:52:49,856] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/mp_rank_00_model_states.pt
- [2024-02-01 14:52:49,856] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/mp_rank_00_model_states.pt...
- [2024-02-01 14:53:25,041] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/mp_rank_00_model_states.pt.
- [2024-02-01 14:53:25,044] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/zero_pp_rank_0_mp_rank_00_optim_states.pt...
- [2024-02-01 14:54:24,364] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/zero_pp_rank_0_mp_rank_00_optim_states.pt.
- [2024-02-01 14:54:24,364] [INFO] [engine.py:3477:_save_zero_checkpoint] zero checkpoint saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-42/global_step42/zero_pp_rank_0_mp_rank_00_optim_states.pt
- [2024-02-01 14:54:24,364] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step42 is ready now!
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
- warnings.warn(
 68%|██████▊ | 43/63 [33:01<26:37, 79.86s/it]
 70%|██████▉ | 44/63 [33:41<21:31, 67.96s/it]
 71%|███████▏ | 45/63 [34:20<17:47, 59.29s/it]
 73%|███████▎ | 46/63 [35:01<15:13, 53.75s/it]
 75%|███████▍ | 47/63 [35:41<13:13, 49.58s/it]
 76%|███████▌ | 48/63 [36:21<11:40, 46.71s/it]
 78%|███████▊ | 49/63 [37:00<10:25, 44.69s/it]
 79%|███████▉ | 50/63 [37:42<09:26, 43.60s/it]
 81%|████████ | 51/63 [38:20<08:24, 42.07s/it]
 83%|████████▎ | 52/63 [39:00<07:34, 41.29s/it]
 84%|████████▍ | 53/63 [39:41<06:52, 41.22s/it]
 86%|████████▌ | 54/63 [40:21<06:07, 40.87s/it]
 87%|████████▋ | 55/63 [41:00<05:22, 40.28s/it]
 89%|████████▉ | 56/63 [41:38<04:39, 39.88s/it]
 90%|█████████ | 57/63 [42:18<03:58, 39.78s/it]
 92%|█████████▏| 58/63 [42:59<03:20, 40.10s/it]
 94%|█████████▎| 59/63 [43:39<02:40, 40.24s/it]
 95%|█████████▌| 60/63 [44:19<02:00, 40.04s/it]
 97%|█████████▋| 61/63 [44:58<01:19, 39.67s/it]
 98%|█████████▊| 62/63 [45:38<00:39, 39.80s/it]
- [INFO|configuration_utils.py:473] 2024-02-01 15:08:30,328 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/config.json
- [INFO|configuration_utils.py:594] 2024-02-01 15:08:30,329 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/generation_config.json
- [INFO|modeling_utils.py:2503] 2024-02-01 15:08:57,391 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/model.safetensors.index.json.
- [INFO|tokenization_utils_base.py:2433] 2024-02-01 15:08:57,393 >> tokenizer config file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/tokenizer_config.json
- [INFO|tokenization_utils_base.py:2442] 2024-02-01 15:08:57,393 >> Special tokens file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/special_tokens_map.json
- [2024-02-01 15:08:58,595] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step63 is about to be saved!
- /home/lirenhao/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
- warnings.warn(
- [2024-02-01 15:08:58,608] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/mp_rank_00_model_states.pt
- [2024-02-01 15:08:58,608] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/mp_rank_00_model_states.pt...
- [2024-02-01 15:09:33,948] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/mp_rank_00_model_states.pt.
- [2024-02-01 15:09:33,951] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/zero_pp_rank_0_mp_rank_00_optim_states.pt...
- [2024-02-01 15:10:31,865] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/zero_pp_rank_0_mp_rank_00_optim_states.pt.
- [2024-02-01 15:10:31,866] [INFO] [engine.py:3477:_save_zero_checkpoint] zero checkpoint saved /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tmp-checkpoint-63/global_step63/zero_pp_rank_0_mp_rank_00_optim_states.pt
- [2024-02-01 15:10:31,866] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step63 is ready now!
- [INFO|trainer.py:1962] 2024-02-01 15:10:32,863 >>
- Training completed. Do not forget to share your model on huggingface.co/models =)
- [INFO|trainer.py:2926] 2024-02-01 15:10:44,639 >> Saving model checkpoint to /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9
- [INFO|configuration_utils.py:473] 2024-02-01 15:10:44,787 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/config.json
- [INFO|configuration_utils.py:594] 2024-02-01 15:10:44,788 >> Configuration saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/generation_config.json
- [2024-02-01 15:10:49,742] [INFO] [launch.py:347:main] Process 3771596 exits successfully.
- [2024-02-01 15:10:49,742] [INFO] [launch.py:347:main] Process 3771597 exits successfully.
- [2024-02-01 15:10:49,742] [INFO] [launch.py:347:main] Process 3771598 exits successfully.
- [INFO|modeling_utils.py:2503] 2024-02-01 15:11:12,707 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/model.safetensors.index.json.
- [INFO|tokenization_utils_base.py:2433] 2024-02-01 15:11:12,709 >> tokenizer config file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/tokenizer_config.json
- [INFO|tokenization_utils_base.py:2442] 2024-02-01 15:11:12,709 >> Special tokens file saved in /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/special_tokens_map.json
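The final model lands in the output directory as four safetensors shards plus model.safetensors.index.json, which transformers resolves automatically. A hedged loading sketch (trust_remote_code=True is an assumption here, since internlm2-style models ship custom modeling code):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    output_dir = "/home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9"
    tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)
    # from_pretrained reads model.safetensors.index.json and loads all four shards.
    model = AutoModelForCausalLM.from_pretrained(output_dir, trust_remote_code=True)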
- ***** train metrics *****
- epoch = 9.0
- train_loss = 1.4982
- train_runtime = 0:48:30.97
- train_samples_per_second = 9.69
- train_steps_per_second = 0.022
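A quick consistency check on these metrics, derivable from the log alone (63 optimizer steps over the run):

    # train_runtime 0:48:30.97 expressed in seconds
    runtime_s = 48 * 60 + 30.97                 # 2910.97 s
    steps = 63
    print(round(steps / runtime_s, 3))          # 0.022 -> matches train_steps_per_second
    # Samples per optimizer step implied by the reported throughput:
    print(round(9.69 * runtime_s / steps))      # ~448 samples per step (effective global batch)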
- Figure saved: /home/lirenhao/projects/LLaMA-Factory/output/9f100e26-d997-46e8-afee-721977a16ca9/training_loss.png
- 02/01/2024 15:11:14 - WARNING - llmtuner.extras.ploting - No metric eval_loss to plot.
- [INFO|modelcard.py:452] 2024-02-01 15:11:14,095 >> Dropping the following result as it does not have all the necessary fields:
- {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
- [2024-02-01 15:11:17,773] [INFO] [launch.py:347:main] Process 3771595 exits successfully.