breathemm committed on
Commit
cff3e35
1 Parent(s): 66750a0

Training complete

Browse files
Files changed (4) hide show
  1. all_results.json +9 -9
  2. eval_results.json +4 -4
  3. train_results.json +5 -5
  4. trainer_state.json +52 -161
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.7299028038978577,
4
- "eval_runtime": 101.4278,
5
- "eval_samples_per_second": 1.094,
6
- "eval_steps_per_second": 0.069,
7
- "total_flos": 3.744392573136077e+16,
8
- "train_loss": 0.5652910582129917,
9
- "train_runtime": 2830.2027,
10
- "train_samples_per_second": 0.353,
11
- "train_steps_per_second": 0.089
12
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.6883889436721802,
4
+ "eval_runtime": 103.9672,
5
+ "eval_samples_per_second": 1.068,
6
+ "eval_steps_per_second": 0.135,
7
+ "total_flos": 4.081496890291814e+16,
8
+ "train_loss": 0.6921328268353901,
9
+ "train_runtime": 3110.5155,
10
+ "train_samples_per_second": 0.321,
11
+ "train_steps_per_second": 0.02
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.7299028038978577,
4
- "eval_runtime": 101.4278,
5
- "eval_samples_per_second": 1.094,
6
- "eval_steps_per_second": 0.069
7
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.6883889436721802,
4
+ "eval_runtime": 103.9672,
5
+ "eval_samples_per_second": 1.068,
6
+ "eval_steps_per_second": 0.135
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.0,
3
- "total_flos": 3.744392573136077e+16,
4
- "train_loss": 0.5652910582129917,
5
- "train_runtime": 2830.2027,
6
- "train_samples_per_second": 0.353,
7
- "train_steps_per_second": 0.089
8
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "total_flos": 4.081496890291814e+16,
4
+ "train_loss": 0.6921328268353901,
5
+ "train_runtime": 3110.5155,
6
+ "train_samples_per_second": 0.321,
7
+ "train_steps_per_second": 0.02
8
  }
trainer_state.json CHANGED
@@ -3,203 +3,94 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 252,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
- {
12
- "epoch": 0.12,
13
- "grad_norm": 350422.84375,
14
- "learning_rate": 0.0002988358809900258,
15
- "loss": 1.2276,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.24,
20
- "grad_norm": 127592.3203125,
21
- "learning_rate": 0.00029536159293436166,
22
- "loss": 0.7731,
23
- "step": 20
24
- },
25
- {
26
- "epoch": 0.36,
27
- "grad_norm": 113769.28125,
28
- "learning_rate": 0.00028963106229663063,
29
- "loss": 0.7033,
30
- "step": 30
31
- },
32
  {
33
  "epoch": 0.48,
34
- "grad_norm": 109609.40625,
35
- "learning_rate": 0.0002817332360055343,
36
- "loss": 0.7912,
37
- "step": 40
38
- },
39
- {
40
- "epoch": 0.6,
41
- "grad_norm": 130476.6171875,
42
- "learning_rate": 0.0002717907008573785,
43
- "loss": 0.7413,
44
- "step": 50
45
- },
46
- {
47
- "epoch": 0.71,
48
- "grad_norm": 86021.2265625,
49
- "learning_rate": 0.0002599577807744739,
50
- "loss": 0.7233,
51
- "step": 60
52
- },
53
- {
54
- "epoch": 0.83,
55
- "grad_norm": 87099.53125,
56
- "learning_rate": 0.0002464181414529809,
57
- "loss": 0.759,
58
- "step": 70
59
  },
60
  {
61
  "epoch": 0.95,
62
- "grad_norm": 79031.671875,
63
- "learning_rate": 0.0002313819395798639,
64
- "loss": 0.7152,
65
- "step": 80
66
- },
67
- {
68
- "epoch": 1.07,
69
- "grad_norm": 96164.9609375,
70
- "learning_rate": 0.00021508256086763368,
71
- "loss": 0.6194,
72
- "step": 90
73
- },
74
- {
75
- "epoch": 1.19,
76
- "grad_norm": 82587.0859375,
77
- "learning_rate": 0.00019777299753775265,
78
- "loss": 0.5308,
79
- "step": 100
80
  },
81
  {
82
- "epoch": 1.31,
83
- "grad_norm": 83371.2578125,
84
- "learning_rate": 0.0001797219214799096,
85
- "loss": 0.5415,
86
- "step": 110
 
87
  },
88
  {
89
  "epoch": 1.43,
90
- "grad_norm": 107494.8984375,
91
- "learning_rate": 0.00016120951403796364,
92
- "loss": 0.5953,
93
- "step": 120
94
- },
95
- {
96
- "epoch": 1.55,
97
- "grad_norm": 110181.2578125,
98
- "learning_rate": 0.0001425231171508954,
99
- "loss": 0.591,
100
- "step": 130
101
- },
102
- {
103
- "epoch": 1.67,
104
- "grad_norm": 84472.59375,
105
- "learning_rate": 0.00012395277334996044,
106
- "loss": 0.5071,
107
- "step": 140
108
- },
109
- {
110
- "epoch": 1.79,
111
- "grad_norm": 72922.546875,
112
- "learning_rate": 0.00010578672383836435,
113
- "loss": 0.5188,
114
- "step": 150
115
  },
116
  {
117
  "epoch": 1.9,
118
- "grad_norm": 84903.71875,
119
- "learning_rate": 8.830693453040829e-05,
120
- "loss": 0.5508,
121
- "step": 160
122
- },
123
- {
124
- "epoch": 2.02,
125
- "grad_norm": 71174.5546875,
126
- "learning_rate": 7.17847194930753e-05,
127
- "loss": 0.4979,
128
- "step": 170
129
- },
130
- {
131
- "epoch": 2.14,
132
- "grad_norm": 190946.0625,
133
- "learning_rate": 5.6476529721189974e-05,
134
- "loss": 0.3451,
135
- "step": 180
136
  },
137
  {
138
- "epoch": 2.26,
139
- "grad_norm": 73154.4609375,
140
- "learning_rate": 4.261997261104223e-05,
141
- "loss": 0.3543,
142
- "step": 190
 
143
  },
144
  {
145
  "epoch": 2.38,
146
- "grad_norm": 97452.4453125,
147
- "learning_rate": 3.0430123916561672e-05,
148
- "loss": 0.3384,
149
- "step": 200
150
- },
151
- {
152
- "epoch": 2.5,
153
- "grad_norm": 75840.453125,
154
- "learning_rate": 2.009618943233419e-05,
155
- "loss": 0.3708,
156
- "step": 210
157
- },
158
- {
159
- "epoch": 2.62,
160
- "grad_norm": 101251.2578125,
161
- "learning_rate": 1.1778568219438839e-05,
162
- "loss": 0.354,
163
- "step": 220
164
- },
165
- {
166
- "epoch": 2.74,
167
- "grad_norm": 77960.1015625,
168
- "learning_rate": 5.606362957498195e-06,
169
- "loss": 0.3275,
170
- "step": 230
171
  },
172
  {
173
  "epoch": 2.86,
174
- "grad_norm": 92760.578125,
175
- "learning_rate": 1.6753760662307215e-06,
176
- "loss": 0.3273,
177
- "step": 240
178
  },
179
  {
180
- "epoch": 2.98,
181
- "grad_norm": 78068.015625,
182
- "learning_rate": 4.662269987756317e-08,
183
- "loss": 0.3638,
184
- "step": 250
 
185
  },
186
  {
187
  "epoch": 3.0,
188
- "step": 252,
189
- "total_flos": 3.744392573136077e+16,
190
- "train_loss": 0.5652910582129917,
191
- "train_runtime": 2830.2027,
192
- "train_samples_per_second": 0.353,
193
- "train_steps_per_second": 0.089
194
  }
195
  ],
196
  "logging_steps": 10,
197
- "max_steps": 252,
198
  "num_input_tokens_seen": 0,
199
  "num_train_epochs": 3,
200
  "save_steps": 10,
201
- "total_flos": 3.744392573136077e+16,
202
- "train_batch_size": 4,
203
  "trial_name": null,
204
  "trial_params": null
205
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 63,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  {
12
  "epoch": 0.48,
13
+ "grad_norm": 78838.859375,
14
+ "learning_rate": 0.00018782215733702286,
15
+ "loss": 1.2119,
16
+ "step": 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
18
  {
19
  "epoch": 0.95,
20
+ "grad_norm": 79446.625,
21
+ "learning_rate": 0.00015425462638657595,
22
+ "loss": 0.7581,
23
+ "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  },
25
  {
26
+ "epoch": 1.0,
27
+ "eval_loss": 0.7133347392082214,
28
+ "eval_runtime": 104.4914,
29
+ "eval_samples_per_second": 1.062,
30
+ "eval_steps_per_second": 0.134,
31
+ "step": 21
32
  },
33
  {
34
  "epoch": 1.43,
35
+ "grad_norm": 60001.390625,
36
+ "learning_rate": 0.00010747300935864243,
37
+ "loss": 0.6176,
38
+ "step": 30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  },
40
  {
41
  "epoch": 1.9,
42
+ "grad_norm": 59219.640625,
43
+ "learning_rate": 5.887128968693887e-05,
44
+ "loss": 0.612,
45
+ "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  },
47
  {
48
+ "epoch": 2.0,
49
+ "eval_loss": 0.680975079536438,
50
+ "eval_runtime": 104.4154,
51
+ "eval_samples_per_second": 1.063,
52
+ "eval_steps_per_second": 0.134,
53
+ "step": 42
54
  },
55
  {
56
  "epoch": 2.38,
57
+ "grad_norm": 56114.09765625,
58
+ "learning_rate": 2.0286749277707782e-05,
59
+ "loss": 0.5092,
60
+ "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  },
62
  {
63
  "epoch": 2.86,
64
+ "grad_norm": 58096.5703125,
65
+ "learning_rate": 1.1169173774871478e-06,
66
+ "loss": 0.4944,
67
+ "step": 60
68
  },
69
  {
70
+ "epoch": 3.0,
71
+ "eval_loss": 0.6883889436721802,
72
+ "eval_runtime": 104.4068,
73
+ "eval_samples_per_second": 1.063,
74
+ "eval_steps_per_second": 0.134,
75
+ "step": 63
76
  },
77
  {
78
  "epoch": 3.0,
79
+ "step": 63,
80
+ "total_flos": 4.081496890291814e+16,
81
+ "train_loss": 0.6921328268353901,
82
+ "train_runtime": 3110.5155,
83
+ "train_samples_per_second": 0.321,
84
+ "train_steps_per_second": 0.02
85
  }
86
  ],
87
  "logging_steps": 10,
88
+ "max_steps": 63,
89
  "num_input_tokens_seen": 0,
90
  "num_train_epochs": 3,
91
  "save_steps": 10,
92
+ "total_flos": 4.081496890291814e+16,
93
+ "train_batch_size": 8,
94
  "trial_name": null,
95
  "trial_params": null
96
  }