File size: 10,140 Bytes
b17ec09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
2021-05-21 21:14:28,688	INFO	__main__	Namespace(adjust_lr=False, config='torchdistill/configs/sample/glue/qqp/ce/bert_large_uncased.yaml', log='log/glue/qqp/ce/bert_large_uncased.txt', private_output='leaderboard/glue/standard/bert_large_uncased/', seed=None, student_only=False, task_name='qqp', test_only=False, world_size=1)
2021-05-21 21:14:28,730	INFO	__main__	Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Use FP16 precision: True

2021-05-21 21:16:07,804	INFO	__main__	Start training
2021-05-21 21:16:07,804	INFO	torchdistill.models.util	[student model]
2021-05-21 21:16:07,804	INFO	torchdistill.models.util	Using the original student model
2021-05-21 21:16:07,804	INFO	torchdistill.core.training	Loss = 1.0 * OrgLoss
2021-05-21 21:16:11,237	INFO	torchdistill.misc.log	Epoch: [0]  [    0/11371]  eta: 2:37:49  lr: 1.9999413713247152e-05  sample/s: 4.944792458146497  loss: 0.6525 (0.6525)  time: 0.8328  data: 0.0239  max mem: 5401
2021-05-21 21:29:10,487	INFO	torchdistill.misc.log	Epoch: [0]  [ 1000/11371]  eta: 2:14:42  lr: 1.941312696039633e-05  sample/s: 4.011099056159446  loss: 0.3216 (0.4234)  time: 0.7933  data: 0.0040  max mem: 12422
2021-05-21 21:42:14,805	INFO	torchdistill.misc.log	Epoch: [0]  [ 2000/11371]  eta: 2:02:06  lr: 1.8826840207545512e-05  sample/s: 5.070043791535248  loss: 0.2346 (0.3648)  time: 0.7847  data: 0.0039  max mem: 12422
2021-05-21 21:55:25,365	INFO	torchdistill.misc.log	Epoch: [0]  [ 3000/11371]  eta: 1:49:28  lr: 1.824055345469469e-05  sample/s: 3.7257152095252013  loss: 0.2308 (0.3350)  time: 0.8381  data: 0.0039  max mem: 12422
2021-05-21 22:08:29,622	INFO	torchdistill.misc.log	Epoch: [0]  [ 4000/11371]  eta: 1:36:23  lr: 1.7654266701843875e-05  sample/s: 6.351632958557614  loss: 0.2229 (0.3187)  time: 0.7550  data: 0.0041  max mem: 12422
2021-05-21 22:21:35,419	INFO	torchdistill.misc.log	Epoch: [0]  [ 5000/11371]  eta: 1:23:20  lr: 1.7067979948993053e-05  sample/s: 4.302527628708725  loss: 0.2084 (0.3052)  time: 0.8191  data: 0.0039  max mem: 12422
2021-05-21 22:34:36,763	INFO	torchdistill.misc.log	Epoch: [0]  [ 6000/11371]  eta: 1:10:12  lr: 1.6481693196142235e-05  sample/s: 5.6633386499236265  loss: 0.2572 (0.2948)  time: 0.8056  data: 0.0038  max mem: 12422
2021-05-21 22:47:34,791	INFO	torchdistill.misc.log	Epoch: [0]  [ 7000/11371]  eta: 0:57:04  lr: 1.5895406443291413e-05  sample/s: 5.072499497352218  loss: 0.2353 (0.2871)  time: 0.6971  data: 0.0038  max mem: 12422
2021-05-21 23:00:34,392	INFO	torchdistill.misc.log	Epoch: [0]  [ 8000/11371]  eta: 0:43:59  lr: 1.5309119690440595e-05  sample/s: 4.661598999284807  loss: 0.2822 (0.2804)  time: 0.7155  data: 0.0037  max mem: 12422
2021-05-21 23:13:36,395	INFO	torchdistill.misc.log	Epoch: [0]  [ 9000/11371]  eta: 0:30:56  lr: 1.4722832937589777e-05  sample/s: 3.0764357080524487  loss: 0.2347 (0.2753)  time: 0.8115  data: 0.0038  max mem: 12422
2021-05-21 23:26:42,471	INFO	torchdistill.misc.log	Epoch: [0]  [10000/11371]  eta: 0:17:53  lr: 1.4136546184738957e-05  sample/s: 4.663713788370137  loss: 0.2396 (0.2712)  time: 0.8491  data: 0.0038  max mem: 12422
2021-05-21 23:39:45,157	INFO	torchdistill.misc.log	Epoch: [0]  [11000/11371]  eta: 0:04:50  lr: 1.3550259431888138e-05  sample/s: 4.012062097322281  loss: 0.1471 (0.2671)  time: 0.7898  data: 0.0039  max mem: 12422
2021-05-21 23:44:36,131	INFO	torchdistill.misc.log	Epoch: [0] Total time: 2:28:25
2021-05-21 23:49:50,700	INFO	/usr/local/lib/python3.7/dist-packages/datasets/metric.py	Removing /root/.cache/huggingface/metrics/glue/qqp/default_experiment-1-0.arrow
2021-05-21 23:49:50,702	INFO	__main__	Validation: accuracy = 0.9027702201335642, f1 = 0.8675583706748424
2021-05-21 23:49:50,702	INFO	__main__	Updating ckpt
2021-05-21 23:49:56,489	INFO	torchdistill.misc.log	Epoch: [1]  [    0/11371]  eta: 3:51:24  lr: 1.3332747046580484e-05  sample/s: 4.572169832788334  loss: 0.2259 (0.2259)  time: 1.2211  data: 0.3462  max mem: 12422
2021-05-22 00:03:04,241	INFO	torchdistill.misc.log	Epoch: [1]  [ 1000/11371]  eta: 2:16:14  lr: 1.2746460293729664e-05  sample/s: 4.007176855215162  loss: 0.0992 (0.1365)  time: 0.8028  data: 0.0039  max mem: 12422
2021-05-22 00:16:04,871	INFO	torchdistill.misc.log	Epoch: [1]  [ 2000/11371]  eta: 2:02:30  lr: 1.2160173540878845e-05  sample/s: 4.009962993707042  loss: 0.0747 (0.1359)  time: 0.7765  data: 0.0039  max mem: 12422
2021-05-22 00:29:08,342	INFO	torchdistill.misc.log	Epoch: [1]  [ 3000/11371]  eta: 1:49:23  lr: 1.1573886788028025e-05  sample/s: 5.6609041606451935  loss: 0.0760 (0.1343)  time: 0.7970  data: 0.0039  max mem: 12422
2021-05-22 00:42:10,684	INFO	torchdistill.misc.log	Epoch: [1]  [ 4000/11371]  eta: 1:36:16  lr: 1.0987600035177207e-05  sample/s: 6.341786798200569  loss: 0.1134 (0.1346)  time: 0.7267  data: 0.0039  max mem: 12422
2021-05-22 00:55:10,248	INFO	torchdistill.misc.log	Epoch: [1]  [ 5000/11371]  eta: 1:23:07  lr: 1.0401313282326387e-05  sample/s: 5.659105436374927  loss: 0.0943 (0.1348)  time: 0.7889  data: 0.0039  max mem: 12422
2021-05-22 01:08:07,447	INFO	torchdistill.misc.log	Epoch: [1]  [ 6000/11371]  eta: 1:09:59  lr: 9.815026529475568e-06  sample/s: 6.343112724248096  loss: 0.0754 (0.1327)  time: 0.7515  data: 0.0038  max mem: 12422
2021-05-22 01:21:16,896	INFO	torchdistill.misc.log	Epoch: [1]  [ 7000/11371]  eta: 0:57:02  lr: 9.228739776624748e-06  sample/s: 5.65419436940216  loss: 0.1113 (0.1310)  time: 0.7775  data: 0.0040  max mem: 12422
2021-05-22 01:34:26,830	INFO	torchdistill.misc.log	Epoch: [1]  [ 8000/11371]  eta: 0:44:02  lr: 8.642453023773928e-06  sample/s: 5.655715411651359  loss: 0.0946 (0.1307)  time: 0.7769  data: 0.0039  max mem: 12422
2021-05-22 01:47:30,469	INFO	torchdistill.misc.log	Epoch: [1]  [ 9000/11371]  eta: 0:30:58  lr: 8.05616627092311e-06  sample/s: 4.297865363806276  loss: 0.0730 (0.1305)  time: 0.8130  data: 0.0039  max mem: 12422
2021-05-22 02:00:41,929	INFO	torchdistill.misc.log	Epoch: [1]  [10000/11371]  eta: 0:17:55  lr: 7.469879518072289e-06  sample/s: 5.66297735715824  loss: 0.1144 (0.1305)  time: 0.7982  data: 0.0038  max mem: 12422
2021-05-22 02:13:41,396	INFO	torchdistill.misc.log	Epoch: [1]  [11000/11371]  eta: 0:04:50  lr: 6.883592765221471e-06  sample/s: 6.346625629944702  loss: 0.0934 (0.1305)  time: 0.7557  data: 0.0039  max mem: 12422
2021-05-22 02:18:28,506	INFO	torchdistill.misc.log	Epoch: [1] Total time: 2:28:33
2021-05-22 02:23:43,038	INFO	/usr/local/lib/python3.7/dist-packages/datasets/metric.py	Removing /root/.cache/huggingface/metrics/glue/qqp/default_experiment-1-0.arrow
2021-05-22 02:23:43,039	INFO	__main__	Validation: accuracy = 0.9108088053425674, f1 = 0.8808564065287783
2021-05-22 02:23:43,040	INFO	__main__	Updating ckpt
2021-05-22 02:23:49,042	INFO	torchdistill.misc.log	Epoch: [2]  [    0/11371]  eta: 3:17:44  lr: 6.666080379913816e-06  sample/s: 3.9286506100722214  loss: 0.0282 (0.0282)  time: 1.0434  data: 0.0252  max mem: 12422
2021-05-22 02:36:56,578	INFO	torchdistill.misc.log	Epoch: [2]  [ 1000/11371]  eta: 2:16:10  lr: 6.079793627062997e-06  sample/s: 5.072415148418462  loss: 0.0815 (0.1079)  time: 0.7822  data: 0.0037  max mem: 12422
2021-05-22 02:50:00,419	INFO	torchdistill.misc.log	Epoch: [2]  [ 2000/11371]  eta: 2:02:43  lr: 5.493506874212178e-06  sample/s: 7.06030068068721  loss: 0.0005 (0.1391)  time: 0.7939  data: 0.0038  max mem: 12422
2021-05-22 03:03:02,755	INFO	torchdistill.misc.log	Epoch: [2]  [ 3000/11371]  eta: 1:49:28  lr: 4.9072201213613585e-06  sample/s: 5.067682304480508  loss: 0.0020 (0.1500)  time: 0.8177  data: 0.0039  max mem: 12422
2021-05-22 03:16:05,350	INFO	torchdistill.misc.log	Epoch: [2]  [ 4000/11371]  eta: 1:36:19  lr: 4.3209333685105384e-06  sample/s: 4.658021705874024  loss: 0.0001 (0.1574)  time: 0.7454  data: 0.0039  max mem: 12422
2021-05-22 03:29:12,334	INFO	torchdistill.misc.log	Epoch: [2]  [ 5000/11371]  eta: 1:23:19  lr: 3.7346466156597192e-06  sample/s: 5.075143340627668  loss: 0.0064 (0.1614)  time: 0.7441  data: 0.0038  max mem: 12422
2021-05-22 03:42:11,969	INFO	torchdistill.misc.log	Epoch: [2]  [ 6000/11371]  eta: 1:10:10  lr: 3.1483598628089e-06  sample/s: 6.343976192907029  loss: 0.0000 (0.1654)  time: 0.7401  data: 0.0039  max mem: 12422
2021-05-22 03:55:11,370	INFO	torchdistill.misc.log	Epoch: [2]  [ 7000/11371]  eta: 0:57:03  lr: 2.562073109958081e-06  sample/s: 5.667865404034177  loss: 0.3340 (0.1671)  time: 0.7892  data: 0.0038  max mem: 12422
2021-05-22 04:08:12,458	INFO	torchdistill.misc.log	Epoch: [2]  [ 8000/11371]  eta: 0:43:59  lr: 1.9757863571072612e-06  sample/s: 4.010017625015954  loss: 0.0000 (0.1712)  time: 0.7973  data: 0.0037  max mem: 12422
2021-05-22 04:21:14,312	INFO	torchdistill.misc.log	Epoch: [2]  [ 9000/11371]  eta: 0:30:56  lr: 1.3894996042564418e-06  sample/s: 5.713281224193597  loss: 0.3198 (0.1729)  time: 0.7105  data: 0.0037  max mem: 12422
2021-05-22 04:34:21,836	INFO	torchdistill.misc.log	Epoch: [2]  [10000/11371]  eta: 0:17:53  lr: 8.032128514056225e-07  sample/s: 4.659584478287122  loss: 0.2673 (0.1738)  time: 0.7881  data: 0.0038  max mem: 12422
2021-05-22 04:47:25,082	INFO	torchdistill.misc.log	Epoch: [2]  [11000/11371]  eta: 0:04:50  lr: 2.169260985548032e-07  sample/s: 5.66490287198808  loss: 0.0145 (0.1735)  time: 0.7445  data: 0.0037  max mem: 12422
2021-05-22 04:52:14,855	INFO	torchdistill.misc.log	Epoch: [2] Total time: 2:28:26
2021-05-22 04:57:29,103	INFO	/usr/local/lib/python3.7/dist-packages/datasets/metric.py	Removing /root/.cache/huggingface/metrics/glue/qqp/default_experiment-1-0.arrow
2021-05-22 04:57:29,104	INFO	__main__	Validation: accuracy = 0.9102646549591887, f1 = 0.8795324744321955
2021-05-22 04:57:39,663	INFO	__main__	[Student: bert-large-uncased]
2021-05-22 05:02:54,796	INFO	/usr/local/lib/python3.7/dist-packages/datasets/metric.py	Removing /root/.cache/huggingface/metrics/glue/qqp/default_experiment-1-0.arrow
2021-05-22 05:02:54,797	INFO	__main__	Test: accuracy = 0.9108088053425674, f1 = 0.8808564065287783
2021-05-22 05:02:54,797	INFO	__main__	Start prediction for private dataset(s)
2021-05-22 05:02:54,798	INFO	__main__	qqp/test: 390965 samples