srsawant34
committed on
Commit
•
493deb9
1
Parent(s):
4f1cb39
Upload folder using huggingface_hub
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- 1_Pooling/config.json +7 -0
- README.md +107 -0
- checkpoint-6027/model.safetensors +3 -0
- checkpoint-6027/optimizer.pt +3 -0
- checkpoint-6027/rng_state.pth +3 -0
- checkpoint-6027/scheduler.pt +3 -0
- checkpoint-6027/trainer_state.json +147 -0
- checkpoint-6027/training_args.bin +3 -0
- checkpoint-6314/model.safetensors +3 -0
- checkpoint-6314/optimizer.pt +3 -0
- checkpoint-6314/rng_state.pth +3 -0
- checkpoint-6314/scheduler.pt +3 -0
- checkpoint-6314/trainer_state.json +153 -0
- checkpoint-6314/training_args.bin +3 -0
- checkpoint-6601/model.safetensors +3 -0
- checkpoint-6601/optimizer.pt +3 -0
- checkpoint-6601/rng_state.pth +3 -0
- checkpoint-6601/scheduler.pt +3 -0
- checkpoint-6601/trainer_state.json +159 -0
- checkpoint-6601/training_args.bin +3 -0
- checkpoint-6888/model.safetensors +3 -0
- checkpoint-6888/optimizer.pt +3 -0
- checkpoint-6888/rng_state.pth +3 -0
- checkpoint-6888/scheduler.pt +3 -0
- checkpoint-6888/trainer_state.json +165 -0
- checkpoint-6888/training_args.bin +3 -0
- checkpoint-7175/model.safetensors +3 -0
- checkpoint-7175/optimizer.pt +3 -0
- checkpoint-7175/rng_state.pth +3 -0
- checkpoint-7175/scheduler.pt +3 -0
- checkpoint-7175/trainer_state.json +171 -0
- checkpoint-7175/training_args.bin +3 -0
- checkpoint-7462/model.safetensors +3 -0
- checkpoint-7462/optimizer.pt +3 -0
- checkpoint-7462/rng_state.pth +3 -0
- checkpoint-7462/scheduler.pt +3 -0
- checkpoint-7462/trainer_state.json +177 -0
- checkpoint-7462/training_args.bin +3 -0
- checkpoint-7749/model.safetensors +3 -0
- checkpoint-7749/optimizer.pt +3 -0
- checkpoint-7749/rng_state.pth +3 -0
- checkpoint-7749/scheduler.pt +3 -0
- checkpoint-7749/trainer_state.json +183 -0
- checkpoint-7749/training_args.bin +3 -0
- checkpoint-8036/model.safetensors +3 -0
- checkpoint-8036/optimizer.pt +3 -0
- checkpoint-8036/rng_state.pth +3 -0
- checkpoint-8036/scheduler.pt +3 -0
- checkpoint-8036/trainer_state.json +189 -0
- checkpoint-8036/training_args.bin +3 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 384,
|
3 |
+
"pooling_mode_cls_token": false,
|
4 |
+
"pooling_mode_mean_tokens": true,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false
|
7 |
+
}
|
README.md
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
pipeline_tag: sentence-similarity
|
3 |
+
license: apache-2.0
|
4 |
+
tags:
|
5 |
+
- sentence-transformers
|
6 |
+
- feature-extraction
|
7 |
+
- sentence-similarity
|
8 |
+
- transformers
|
9 |
+
---
|
10 |
+
|
11 |
+
# sentence-transformers/paraphrase-MiniLM-L6-v2
|
12 |
+
|
13 |
+
This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
## Usage (Sentence-Transformers)
|
18 |
+
|
19 |
+
Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
|
20 |
+
|
21 |
+
```
|
22 |
+
pip install -U sentence-transformers
|
23 |
+
```
|
24 |
+
|
25 |
+
Then you can use the model like this:
|
26 |
+
|
27 |
+
```python
|
28 |
+
from sentence_transformers import SentenceTransformer
|
29 |
+
sentences = ["This is an example sentence", "Each sentence is converted"]
|
30 |
+
|
31 |
+
model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
|
32 |
+
embeddings = model.encode(sentences)
|
33 |
+
print(embeddings)
|
34 |
+
```
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
## Usage (HuggingFace Transformers)
|
39 |
+
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.
|
40 |
+
|
41 |
+
```python
|
42 |
+
from transformers import AutoTokenizer, AutoModel
|
43 |
+
import torch
|
44 |
+
|
45 |
+
|
46 |
+
#Mean Pooling - Take attention mask into account for correct averaging
|
47 |
+
def mean_pooling(model_output, attention_mask):
|
48 |
+
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
49 |
+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
50 |
+
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
51 |
+
|
52 |
+
|
53 |
+
# Sentences we want sentence embeddings for
|
54 |
+
sentences = ['This is an example sentence', 'Each sentence is converted']
|
55 |
+
|
56 |
+
# Load model from HuggingFace Hub
|
57 |
+
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
|
58 |
+
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
|
59 |
+
|
60 |
+
# Tokenize sentences
|
61 |
+
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
62 |
+
|
63 |
+
# Compute token embeddings
|
64 |
+
with torch.no_grad():
|
65 |
+
model_output = model(**encoded_input)
|
66 |
+
|
67 |
+
# Perform pooling. In this case, mean pooling.
|
68 |
+
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
69 |
+
|
70 |
+
print("Sentence embeddings:")
|
71 |
+
print(sentence_embeddings)
|
72 |
+
```
|
73 |
+
|
74 |
+
|
75 |
+
|
76 |
+
## Evaluation Results
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/paraphrase-MiniLM-L6-v2)
|
81 |
+
|
82 |
+
|
83 |
+
|
84 |
+
## Full Model Architecture
|
85 |
+
```
|
86 |
+
SentenceTransformer(
|
87 |
+
(0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel
|
88 |
+
(1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
|
89 |
+
)
|
90 |
+
```
|
91 |
+
|
92 |
+
## Citing & Authors
|
93 |
+
|
94 |
+
This model was trained by [sentence-transformers](https://www.sbert.net/).
|
95 |
+
|
96 |
+
If you find this model helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084):
|
97 |
+
```bibtex
|
98 |
+
@inproceedings{reimers-2019-sentence-bert,
|
99 |
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
100 |
+
author = "Reimers, Nils and Gurevych, Iryna",
|
101 |
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
102 |
+
month = "11",
|
103 |
+
year = "2019",
|
104 |
+
publisher = "Association for Computational Linguistics",
|
105 |
+
url = "http://arxiv.org/abs/1908.10084",
|
106 |
+
}
|
107 |
+
```
|
checkpoint-6027/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4733fd7d4f13b14f19647c36ba4fa4454d7f3de192d83b1afb82511a84823e23
|
3 |
+
size 90866120
|
checkpoint-6027/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c886a67f3900eef2655cca574218b61d5a1cd40487a321e085fec5a326ae7540
|
3 |
+
size 180607738
|
checkpoint-6027/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8c8132ebefa53f250b44c875ba8e0ff411e4d800e0e9e5925eb8ae5a49fcd489
|
3 |
+
size 14244
|
checkpoint-6027/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d3fbd046324521587b4e80e111a51723c0f196dd3ff18ee5a2239809294d16ec
|
3 |
+
size 1064
|
checkpoint-6027/trainer_state.json
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 21.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 6027,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.0,
|
13 |
+
"learning_rate": 0.000484375,
|
14 |
+
"loss": 3.0823,
|
15 |
+
"step": 287
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 2.0,
|
19 |
+
"learning_rate": 0.00046875,
|
20 |
+
"loss": 2.7242,
|
21 |
+
"step": 574
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 3.0,
|
25 |
+
"learning_rate": 0.000453125,
|
26 |
+
"loss": 2.5348,
|
27 |
+
"step": 861
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 4.0,
|
31 |
+
"learning_rate": 0.0004375,
|
32 |
+
"loss": 2.4455,
|
33 |
+
"step": 1148
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 5.0,
|
37 |
+
"learning_rate": 0.000421875,
|
38 |
+
"loss": 2.3794,
|
39 |
+
"step": 1435
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 6.0,
|
43 |
+
"learning_rate": 0.00040625000000000004,
|
44 |
+
"loss": 2.3375,
|
45 |
+
"step": 1722
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 7.0,
|
49 |
+
"learning_rate": 0.000390625,
|
50 |
+
"loss": 2.3262,
|
51 |
+
"step": 2009
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 8.0,
|
55 |
+
"learning_rate": 0.000375,
|
56 |
+
"loss": 2.3114,
|
57 |
+
"step": 2296
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 9.0,
|
61 |
+
"learning_rate": 0.000359375,
|
62 |
+
"loss": 2.2921,
|
63 |
+
"step": 2583
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 10.0,
|
67 |
+
"learning_rate": 0.00034375,
|
68 |
+
"loss": 2.2918,
|
69 |
+
"step": 2870
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 11.0,
|
73 |
+
"learning_rate": 0.000328125,
|
74 |
+
"loss": 2.2578,
|
75 |
+
"step": 3157
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"epoch": 12.0,
|
79 |
+
"learning_rate": 0.0003125,
|
80 |
+
"loss": 2.2693,
|
81 |
+
"step": 3444
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 13.0,
|
85 |
+
"learning_rate": 0.000296875,
|
86 |
+
"loss": 2.2594,
|
87 |
+
"step": 3731
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 14.0,
|
91 |
+
"learning_rate": 0.00028125000000000003,
|
92 |
+
"loss": 2.2555,
|
93 |
+
"step": 4018
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 15.0,
|
97 |
+
"learning_rate": 0.000265625,
|
98 |
+
"loss": 2.2481,
|
99 |
+
"step": 4305
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"epoch": 16.0,
|
103 |
+
"learning_rate": 0.00025,
|
104 |
+
"loss": 2.2468,
|
105 |
+
"step": 4592
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"epoch": 17.0,
|
109 |
+
"learning_rate": 0.000234375,
|
110 |
+
"loss": 2.248,
|
111 |
+
"step": 4879
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 18.0,
|
115 |
+
"learning_rate": 0.00021875,
|
116 |
+
"loss": 2.2435,
|
117 |
+
"step": 5166
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 19.0,
|
121 |
+
"learning_rate": 0.00020312500000000002,
|
122 |
+
"loss": 2.2319,
|
123 |
+
"step": 5453
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 20.0,
|
127 |
+
"learning_rate": 0.0001875,
|
128 |
+
"loss": 2.2303,
|
129 |
+
"step": 5740
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 21.0,
|
133 |
+
"learning_rate": 0.000171875,
|
134 |
+
"loss": 2.2215,
|
135 |
+
"step": 6027
|
136 |
+
}
|
137 |
+
],
|
138 |
+
"logging_steps": 500,
|
139 |
+
"max_steps": 9184,
|
140 |
+
"num_input_tokens_seen": 0,
|
141 |
+
"num_train_epochs": 32,
|
142 |
+
"save_steps": 500,
|
143 |
+
"total_flos": 0.0,
|
144 |
+
"train_batch_size": 64,
|
145 |
+
"trial_name": null,
|
146 |
+
"trial_params": null
|
147 |
+
}
|
checkpoint-6027/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2d8d559a9f0f171908c0a1db0384f658c1e31689653c8277d9ca4a2848e36c6
|
3 |
+
size 4792
|
checkpoint-6314/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ffb9d234ce16eb62bfaa4221636dafbdd629621a3d5d6c5fd9ab84b3b0b1b1a6
|
3 |
+
size 90866120
|
checkpoint-6314/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a87cfb151fe8553d7bf1bc304afb20d87b6535e8731fa1a235ed08ffc7a2fe6
|
3 |
+
size 180607738
|
checkpoint-6314/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:19e881a8dd55719ee21e204acb99a2911d67016f769fb6b3b7466fafbaf0f9cd
|
3 |
+
size 14244
|
checkpoint-6314/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:287908aafda9b69a00a1a7fc70dab854c95a31d48d4802f418545e885ada59a2
|
3 |
+
size 1064
|
checkpoint-6314/trainer_state.json
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 22.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 6314,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.0,
|
13 |
+
"learning_rate": 0.000484375,
|
14 |
+
"loss": 3.0823,
|
15 |
+
"step": 287
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 2.0,
|
19 |
+
"learning_rate": 0.00046875,
|
20 |
+
"loss": 2.7242,
|
21 |
+
"step": 574
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 3.0,
|
25 |
+
"learning_rate": 0.000453125,
|
26 |
+
"loss": 2.5348,
|
27 |
+
"step": 861
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 4.0,
|
31 |
+
"learning_rate": 0.0004375,
|
32 |
+
"loss": 2.4455,
|
33 |
+
"step": 1148
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 5.0,
|
37 |
+
"learning_rate": 0.000421875,
|
38 |
+
"loss": 2.3794,
|
39 |
+
"step": 1435
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 6.0,
|
43 |
+
"learning_rate": 0.00040625000000000004,
|
44 |
+
"loss": 2.3375,
|
45 |
+
"step": 1722
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 7.0,
|
49 |
+
"learning_rate": 0.000390625,
|
50 |
+
"loss": 2.3262,
|
51 |
+
"step": 2009
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 8.0,
|
55 |
+
"learning_rate": 0.000375,
|
56 |
+
"loss": 2.3114,
|
57 |
+
"step": 2296
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 9.0,
|
61 |
+
"learning_rate": 0.000359375,
|
62 |
+
"loss": 2.2921,
|
63 |
+
"step": 2583
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 10.0,
|
67 |
+
"learning_rate": 0.00034375,
|
68 |
+
"loss": 2.2918,
|
69 |
+
"step": 2870
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 11.0,
|
73 |
+
"learning_rate": 0.000328125,
|
74 |
+
"loss": 2.2578,
|
75 |
+
"step": 3157
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"epoch": 12.0,
|
79 |
+
"learning_rate": 0.0003125,
|
80 |
+
"loss": 2.2693,
|
81 |
+
"step": 3444
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 13.0,
|
85 |
+
"learning_rate": 0.000296875,
|
86 |
+
"loss": 2.2594,
|
87 |
+
"step": 3731
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 14.0,
|
91 |
+
"learning_rate": 0.00028125000000000003,
|
92 |
+
"loss": 2.2555,
|
93 |
+
"step": 4018
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 15.0,
|
97 |
+
"learning_rate": 0.000265625,
|
98 |
+
"loss": 2.2481,
|
99 |
+
"step": 4305
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"epoch": 16.0,
|
103 |
+
"learning_rate": 0.00025,
|
104 |
+
"loss": 2.2468,
|
105 |
+
"step": 4592
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"epoch": 17.0,
|
109 |
+
"learning_rate": 0.000234375,
|
110 |
+
"loss": 2.248,
|
111 |
+
"step": 4879
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 18.0,
|
115 |
+
"learning_rate": 0.00021875,
|
116 |
+
"loss": 2.2435,
|
117 |
+
"step": 5166
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 19.0,
|
121 |
+
"learning_rate": 0.00020312500000000002,
|
122 |
+
"loss": 2.2319,
|
123 |
+
"step": 5453
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 20.0,
|
127 |
+
"learning_rate": 0.0001875,
|
128 |
+
"loss": 2.2303,
|
129 |
+
"step": 5740
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 21.0,
|
133 |
+
"learning_rate": 0.000171875,
|
134 |
+
"loss": 2.2215,
|
135 |
+
"step": 6027
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 22.0,
|
139 |
+
"learning_rate": 0.00015625,
|
140 |
+
"loss": 2.2256,
|
141 |
+
"step": 6314
|
142 |
+
}
|
143 |
+
],
|
144 |
+
"logging_steps": 500,
|
145 |
+
"max_steps": 9184,
|
146 |
+
"num_input_tokens_seen": 0,
|
147 |
+
"num_train_epochs": 32,
|
148 |
+
"save_steps": 500,
|
149 |
+
"total_flos": 0.0,
|
150 |
+
"train_batch_size": 64,
|
151 |
+
"trial_name": null,
|
152 |
+
"trial_params": null
|
153 |
+
}
|
checkpoint-6314/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2d8d559a9f0f171908c0a1db0384f658c1e31689653c8277d9ca4a2848e36c6
|
3 |
+
size 4792
|
checkpoint-6601/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f34e4e0514126e422d0077461ee3fdf8da8668c43e97aaf363dc673364948c6f
|
3 |
+
size 90866120
|
checkpoint-6601/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:63e359bde6424914d379e217b4ba17a6f2d39e97e20cd4836b7d77b4ca84d62a
|
3 |
+
size 180607738
|
checkpoint-6601/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d798f414d39bf019c39269ef4f56532a529aab04afb3a417a5ff16b7ed0ad786
|
3 |
+
size 14244
|
checkpoint-6601/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8c1c73cd10d4c6ed97f6679e38fdc6b436e706a28ba69046af4b534c81c02ec
|
3 |
+
size 1064
|
checkpoint-6601/trainer_state.json
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 23.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 6601,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.0,
|
13 |
+
"learning_rate": 0.000484375,
|
14 |
+
"loss": 3.0823,
|
15 |
+
"step": 287
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 2.0,
|
19 |
+
"learning_rate": 0.00046875,
|
20 |
+
"loss": 2.7242,
|
21 |
+
"step": 574
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 3.0,
|
25 |
+
"learning_rate": 0.000453125,
|
26 |
+
"loss": 2.5348,
|
27 |
+
"step": 861
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 4.0,
|
31 |
+
"learning_rate": 0.0004375,
|
32 |
+
"loss": 2.4455,
|
33 |
+
"step": 1148
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 5.0,
|
37 |
+
"learning_rate": 0.000421875,
|
38 |
+
"loss": 2.3794,
|
39 |
+
"step": 1435
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 6.0,
|
43 |
+
"learning_rate": 0.00040625000000000004,
|
44 |
+
"loss": 2.3375,
|
45 |
+
"step": 1722
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 7.0,
|
49 |
+
"learning_rate": 0.000390625,
|
50 |
+
"loss": 2.3262,
|
51 |
+
"step": 2009
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 8.0,
|
55 |
+
"learning_rate": 0.000375,
|
56 |
+
"loss": 2.3114,
|
57 |
+
"step": 2296
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 9.0,
|
61 |
+
"learning_rate": 0.000359375,
|
62 |
+
"loss": 2.2921,
|
63 |
+
"step": 2583
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 10.0,
|
67 |
+
"learning_rate": 0.00034375,
|
68 |
+
"loss": 2.2918,
|
69 |
+
"step": 2870
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 11.0,
|
73 |
+
"learning_rate": 0.000328125,
|
74 |
+
"loss": 2.2578,
|
75 |
+
"step": 3157
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"epoch": 12.0,
|
79 |
+
"learning_rate": 0.0003125,
|
80 |
+
"loss": 2.2693,
|
81 |
+
"step": 3444
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 13.0,
|
85 |
+
"learning_rate": 0.000296875,
|
86 |
+
"loss": 2.2594,
|
87 |
+
"step": 3731
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 14.0,
|
91 |
+
"learning_rate": 0.00028125000000000003,
|
92 |
+
"loss": 2.2555,
|
93 |
+
"step": 4018
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 15.0,
|
97 |
+
"learning_rate": 0.000265625,
|
98 |
+
"loss": 2.2481,
|
99 |
+
"step": 4305
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"epoch": 16.0,
|
103 |
+
"learning_rate": 0.00025,
|
104 |
+
"loss": 2.2468,
|
105 |
+
"step": 4592
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"epoch": 17.0,
|
109 |
+
"learning_rate": 0.000234375,
|
110 |
+
"loss": 2.248,
|
111 |
+
"step": 4879
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 18.0,
|
115 |
+
"learning_rate": 0.00021875,
|
116 |
+
"loss": 2.2435,
|
117 |
+
"step": 5166
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 19.0,
|
121 |
+
"learning_rate": 0.00020312500000000002,
|
122 |
+
"loss": 2.2319,
|
123 |
+
"step": 5453
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 20.0,
|
127 |
+
"learning_rate": 0.0001875,
|
128 |
+
"loss": 2.2303,
|
129 |
+
"step": 5740
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 21.0,
|
133 |
+
"learning_rate": 0.000171875,
|
134 |
+
"loss": 2.2215,
|
135 |
+
"step": 6027
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 22.0,
|
139 |
+
"learning_rate": 0.00015625,
|
140 |
+
"loss": 2.2256,
|
141 |
+
"step": 6314
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"epoch": 23.0,
|
145 |
+
"learning_rate": 0.00014062500000000002,
|
146 |
+
"loss": 2.2257,
|
147 |
+
"step": 6601
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"logging_steps": 500,
|
151 |
+
"max_steps": 9184,
|
152 |
+
"num_input_tokens_seen": 0,
|
153 |
+
"num_train_epochs": 32,
|
154 |
+
"save_steps": 500,
|
155 |
+
"total_flos": 0.0,
|
156 |
+
"train_batch_size": 64,
|
157 |
+
"trial_name": null,
|
158 |
+
"trial_params": null
|
159 |
+
}
|
checkpoint-6601/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2d8d559a9f0f171908c0a1db0384f658c1e31689653c8277d9ca4a2848e36c6
|
3 |
+
size 4792
|
checkpoint-6888/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4ed9000929251e2f6f2419aac4b88167e4e0264b576d3bc6a2d1bb026eac5bd0
|
3 |
+
size 90866120
|
checkpoint-6888/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:49e950c4c4252d5d35e86d5eb4237b5d9fe84e76c44f29148682318bcc0019b4
|
3 |
+
size 180607738
|
checkpoint-6888/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02dd0d13161623f5c7e3f1f8f092881967f367c62ffd2bdebb58fb8c30515275
|
3 |
+
size 14244
|
checkpoint-6888/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:94d772f8d585712dde96e99df002ebca4da45da1695bb55aa70e11a188f02c98
|
3 |
+
size 1064
|
checkpoint-6888/trainer_state.json
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 24.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 6888,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.0,
|
13 |
+
"learning_rate": 0.000484375,
|
14 |
+
"loss": 3.0823,
|
15 |
+
"step": 287
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 2.0,
|
19 |
+
"learning_rate": 0.00046875,
|
20 |
+
"loss": 2.7242,
|
21 |
+
"step": 574
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 3.0,
|
25 |
+
"learning_rate": 0.000453125,
|
26 |
+
"loss": 2.5348,
|
27 |
+
"step": 861
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 4.0,
|
31 |
+
"learning_rate": 0.0004375,
|
32 |
+
"loss": 2.4455,
|
33 |
+
"step": 1148
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 5.0,
|
37 |
+
"learning_rate": 0.000421875,
|
38 |
+
"loss": 2.3794,
|
39 |
+
"step": 1435
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 6.0,
|
43 |
+
"learning_rate": 0.00040625000000000004,
|
44 |
+
"loss": 2.3375,
|
45 |
+
"step": 1722
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 7.0,
|
49 |
+
"learning_rate": 0.000390625,
|
50 |
+
"loss": 2.3262,
|
51 |
+
"step": 2009
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 8.0,
|
55 |
+
"learning_rate": 0.000375,
|
56 |
+
"loss": 2.3114,
|
57 |
+
"step": 2296
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 9.0,
|
61 |
+
"learning_rate": 0.000359375,
|
62 |
+
"loss": 2.2921,
|
63 |
+
"step": 2583
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 10.0,
|
67 |
+
"learning_rate": 0.00034375,
|
68 |
+
"loss": 2.2918,
|
69 |
+
"step": 2870
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 11.0,
|
73 |
+
"learning_rate": 0.000328125,
|
74 |
+
"loss": 2.2578,
|
75 |
+
"step": 3157
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"epoch": 12.0,
|
79 |
+
"learning_rate": 0.0003125,
|
80 |
+
"loss": 2.2693,
|
81 |
+
"step": 3444
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 13.0,
|
85 |
+
"learning_rate": 0.000296875,
|
86 |
+
"loss": 2.2594,
|
87 |
+
"step": 3731
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 14.0,
|
91 |
+
"learning_rate": 0.00028125000000000003,
|
92 |
+
"loss": 2.2555,
|
93 |
+
"step": 4018
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 15.0,
|
97 |
+
"learning_rate": 0.000265625,
|
98 |
+
"loss": 2.2481,
|
99 |
+
"step": 4305
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"epoch": 16.0,
|
103 |
+
"learning_rate": 0.00025,
|
104 |
+
"loss": 2.2468,
|
105 |
+
"step": 4592
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"epoch": 17.0,
|
109 |
+
"learning_rate": 0.000234375,
|
110 |
+
"loss": 2.248,
|
111 |
+
"step": 4879
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 18.0,
|
115 |
+
"learning_rate": 0.00021875,
|
116 |
+
"loss": 2.2435,
|
117 |
+
"step": 5166
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 19.0,
|
121 |
+
"learning_rate": 0.00020312500000000002,
|
122 |
+
"loss": 2.2319,
|
123 |
+
"step": 5453
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 20.0,
|
127 |
+
"learning_rate": 0.0001875,
|
128 |
+
"loss": 2.2303,
|
129 |
+
"step": 5740
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 21.0,
|
133 |
+
"learning_rate": 0.000171875,
|
134 |
+
"loss": 2.2215,
|
135 |
+
"step": 6027
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 22.0,
|
139 |
+
"learning_rate": 0.00015625,
|
140 |
+
"loss": 2.2256,
|
141 |
+
"step": 6314
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"epoch": 23.0,
|
145 |
+
"learning_rate": 0.00014062500000000002,
|
146 |
+
"loss": 2.2257,
|
147 |
+
"step": 6601
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"epoch": 24.0,
|
151 |
+
"learning_rate": 0.000125,
|
152 |
+
"loss": 2.2275,
|
153 |
+
"step": 6888
|
154 |
+
}
|
155 |
+
],
|
156 |
+
"logging_steps": 500,
|
157 |
+
"max_steps": 9184,
|
158 |
+
"num_input_tokens_seen": 0,
|
159 |
+
"num_train_epochs": 32,
|
160 |
+
"save_steps": 500,
|
161 |
+
"total_flos": 0.0,
|
162 |
+
"train_batch_size": 64,
|
163 |
+
"trial_name": null,
|
164 |
+
"trial_params": null
|
165 |
+
}
|
checkpoint-6888/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2d8d559a9f0f171908c0a1db0384f658c1e31689653c8277d9ca4a2848e36c6
|
3 |
+
size 4792
|
checkpoint-7175/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:70399412ee3a628d0f3b4610a75eb6d34bbbfd5af078a54428cf923428ede87c
|
3 |
+
size 90866120
|
checkpoint-7175/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35df210873e2c02a975d32ef197ce30b942c815e6ae93698e6ed9111688677e9
|
3 |
+
size 180607738
|
checkpoint-7175/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bfad4aa84c0d92f9d5ccb73464e7c346e61b36684ece175fbbc2d6232d4d2ec1
|
3 |
+
size 14244
|
checkpoint-7175/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8b28adf85aa6d40a127d685592ca8e5c171fb4571c03ae04230db692a8f1ffa7
|
3 |
+
size 1064
|
checkpoint-7175/trainer_state.json
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 25.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 7175,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.0,
|
13 |
+
"learning_rate": 0.000484375,
|
14 |
+
"loss": 3.0823,
|
15 |
+
"step": 287
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 2.0,
|
19 |
+
"learning_rate": 0.00046875,
|
20 |
+
"loss": 2.7242,
|
21 |
+
"step": 574
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 3.0,
|
25 |
+
"learning_rate": 0.000453125,
|
26 |
+
"loss": 2.5348,
|
27 |
+
"step": 861
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 4.0,
|
31 |
+
"learning_rate": 0.0004375,
|
32 |
+
"loss": 2.4455,
|
33 |
+
"step": 1148
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 5.0,
|
37 |
+
"learning_rate": 0.000421875,
|
38 |
+
"loss": 2.3794,
|
39 |
+
"step": 1435
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 6.0,
|
43 |
+
"learning_rate": 0.00040625000000000004,
|
44 |
+
"loss": 2.3375,
|
45 |
+
"step": 1722
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 7.0,
|
49 |
+
"learning_rate": 0.000390625,
|
50 |
+
"loss": 2.3262,
|
51 |
+
"step": 2009
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 8.0,
|
55 |
+
"learning_rate": 0.000375,
|
56 |
+
"loss": 2.3114,
|
57 |
+
"step": 2296
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 9.0,
|
61 |
+
"learning_rate": 0.000359375,
|
62 |
+
"loss": 2.2921,
|
63 |
+
"step": 2583
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 10.0,
|
67 |
+
"learning_rate": 0.00034375,
|
68 |
+
"loss": 2.2918,
|
69 |
+
"step": 2870
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 11.0,
|
73 |
+
"learning_rate": 0.000328125,
|
74 |
+
"loss": 2.2578,
|
75 |
+
"step": 3157
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"epoch": 12.0,
|
79 |
+
"learning_rate": 0.0003125,
|
80 |
+
"loss": 2.2693,
|
81 |
+
"step": 3444
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 13.0,
|
85 |
+
"learning_rate": 0.000296875,
|
86 |
+
"loss": 2.2594,
|
87 |
+
"step": 3731
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 14.0,
|
91 |
+
"learning_rate": 0.00028125000000000003,
|
92 |
+
"loss": 2.2555,
|
93 |
+
"step": 4018
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 15.0,
|
97 |
+
"learning_rate": 0.000265625,
|
98 |
+
"loss": 2.2481,
|
99 |
+
"step": 4305
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"epoch": 16.0,
|
103 |
+
"learning_rate": 0.00025,
|
104 |
+
"loss": 2.2468,
|
105 |
+
"step": 4592
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"epoch": 17.0,
|
109 |
+
"learning_rate": 0.000234375,
|
110 |
+
"loss": 2.248,
|
111 |
+
"step": 4879
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 18.0,
|
115 |
+
"learning_rate": 0.00021875,
|
116 |
+
"loss": 2.2435,
|
117 |
+
"step": 5166
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 19.0,
|
121 |
+
"learning_rate": 0.00020312500000000002,
|
122 |
+
"loss": 2.2319,
|
123 |
+
"step": 5453
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 20.0,
|
127 |
+
"learning_rate": 0.0001875,
|
128 |
+
"loss": 2.2303,
|
129 |
+
"step": 5740
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 21.0,
|
133 |
+
"learning_rate": 0.000171875,
|
134 |
+
"loss": 2.2215,
|
135 |
+
"step": 6027
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 22.0,
|
139 |
+
"learning_rate": 0.00015625,
|
140 |
+
"loss": 2.2256,
|
141 |
+
"step": 6314
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"epoch": 23.0,
|
145 |
+
"learning_rate": 0.00014062500000000002,
|
146 |
+
"loss": 2.2257,
|
147 |
+
"step": 6601
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"epoch": 24.0,
|
151 |
+
"learning_rate": 0.000125,
|
152 |
+
"loss": 2.2275,
|
153 |
+
"step": 6888
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"epoch": 25.0,
|
157 |
+
"learning_rate": 0.000109375,
|
158 |
+
"loss": 2.2225,
|
159 |
+
"step": 7175
|
160 |
+
}
|
161 |
+
],
|
162 |
+
"logging_steps": 500,
|
163 |
+
"max_steps": 9184,
|
164 |
+
"num_input_tokens_seen": 0,
|
165 |
+
"num_train_epochs": 32,
|
166 |
+
"save_steps": 500,
|
167 |
+
"total_flos": 0.0,
|
168 |
+
"train_batch_size": 64,
|
169 |
+
"trial_name": null,
|
170 |
+
"trial_params": null
|
171 |
+
}
|
checkpoint-7175/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2d8d559a9f0f171908c0a1db0384f658c1e31689653c8277d9ca4a2848e36c6
|
3 |
+
size 4792
|
checkpoint-7462/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:93de1f250f3d2b213443d49f9a6bc58b9bc6136b5360cd3981314985ac99869e
|
3 |
+
size 90866120
|
checkpoint-7462/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:95db1db9cba8df01031b08118df90149273beaa66c8a2f4420b573054b006036
|
3 |
+
size 180607738
|
checkpoint-7462/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:989c0e225ca35a25d7e972b25c493513a6c249b1701d9a4935578c8df15b4a84
|
3 |
+
size 14244
|
checkpoint-7462/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0535cd984cd25c62d05ba741b59307e6d473370910220c9493fb6f0fb158f2c7
|
3 |
+
size 1064
|
checkpoint-7462/trainer_state.json
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 26.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 7462,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.0,
|
13 |
+
"learning_rate": 0.000484375,
|
14 |
+
"loss": 3.0823,
|
15 |
+
"step": 287
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 2.0,
|
19 |
+
"learning_rate": 0.00046875,
|
20 |
+
"loss": 2.7242,
|
21 |
+
"step": 574
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 3.0,
|
25 |
+
"learning_rate": 0.000453125,
|
26 |
+
"loss": 2.5348,
|
27 |
+
"step": 861
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 4.0,
|
31 |
+
"learning_rate": 0.0004375,
|
32 |
+
"loss": 2.4455,
|
33 |
+
"step": 1148
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 5.0,
|
37 |
+
"learning_rate": 0.000421875,
|
38 |
+
"loss": 2.3794,
|
39 |
+
"step": 1435
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 6.0,
|
43 |
+
"learning_rate": 0.00040625000000000004,
|
44 |
+
"loss": 2.3375,
|
45 |
+
"step": 1722
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 7.0,
|
49 |
+
"learning_rate": 0.000390625,
|
50 |
+
"loss": 2.3262,
|
51 |
+
"step": 2009
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 8.0,
|
55 |
+
"learning_rate": 0.000375,
|
56 |
+
"loss": 2.3114,
|
57 |
+
"step": 2296
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 9.0,
|
61 |
+
"learning_rate": 0.000359375,
|
62 |
+
"loss": 2.2921,
|
63 |
+
"step": 2583
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 10.0,
|
67 |
+
"learning_rate": 0.00034375,
|
68 |
+
"loss": 2.2918,
|
69 |
+
"step": 2870
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 11.0,
|
73 |
+
"learning_rate": 0.000328125,
|
74 |
+
"loss": 2.2578,
|
75 |
+
"step": 3157
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"epoch": 12.0,
|
79 |
+
"learning_rate": 0.0003125,
|
80 |
+
"loss": 2.2693,
|
81 |
+
"step": 3444
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 13.0,
|
85 |
+
"learning_rate": 0.000296875,
|
86 |
+
"loss": 2.2594,
|
87 |
+
"step": 3731
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 14.0,
|
91 |
+
"learning_rate": 0.00028125000000000003,
|
92 |
+
"loss": 2.2555,
|
93 |
+
"step": 4018
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 15.0,
|
97 |
+
"learning_rate": 0.000265625,
|
98 |
+
"loss": 2.2481,
|
99 |
+
"step": 4305
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"epoch": 16.0,
|
103 |
+
"learning_rate": 0.00025,
|
104 |
+
"loss": 2.2468,
|
105 |
+
"step": 4592
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"epoch": 17.0,
|
109 |
+
"learning_rate": 0.000234375,
|
110 |
+
"loss": 2.248,
|
111 |
+
"step": 4879
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 18.0,
|
115 |
+
"learning_rate": 0.00021875,
|
116 |
+
"loss": 2.2435,
|
117 |
+
"step": 5166
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 19.0,
|
121 |
+
"learning_rate": 0.00020312500000000002,
|
122 |
+
"loss": 2.2319,
|
123 |
+
"step": 5453
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 20.0,
|
127 |
+
"learning_rate": 0.0001875,
|
128 |
+
"loss": 2.2303,
|
129 |
+
"step": 5740
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 21.0,
|
133 |
+
"learning_rate": 0.000171875,
|
134 |
+
"loss": 2.2215,
|
135 |
+
"step": 6027
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 22.0,
|
139 |
+
"learning_rate": 0.00015625,
|
140 |
+
"loss": 2.2256,
|
141 |
+
"step": 6314
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"epoch": 23.0,
|
145 |
+
"learning_rate": 0.00014062500000000002,
|
146 |
+
"loss": 2.2257,
|
147 |
+
"step": 6601
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"epoch": 24.0,
|
151 |
+
"learning_rate": 0.000125,
|
152 |
+
"loss": 2.2275,
|
153 |
+
"step": 6888
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"epoch": 25.0,
|
157 |
+
"learning_rate": 0.000109375,
|
158 |
+
"loss": 2.2225,
|
159 |
+
"step": 7175
|
160 |
+
},
|
161 |
+
{
|
162 |
+
"epoch": 26.0,
|
163 |
+
"learning_rate": 9.375e-05,
|
164 |
+
"loss": 2.2166,
|
165 |
+
"step": 7462
|
166 |
+
}
|
167 |
+
],
|
168 |
+
"logging_steps": 500,
|
169 |
+
"max_steps": 9184,
|
170 |
+
"num_input_tokens_seen": 0,
|
171 |
+
"num_train_epochs": 32,
|
172 |
+
"save_steps": 500,
|
173 |
+
"total_flos": 0.0,
|
174 |
+
"train_batch_size": 64,
|
175 |
+
"trial_name": null,
|
176 |
+
"trial_params": null
|
177 |
+
}
|
checkpoint-7462/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2d8d559a9f0f171908c0a1db0384f658c1e31689653c8277d9ca4a2848e36c6
|
3 |
+
size 4792
|
checkpoint-7749/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:21ce4eca180873fb16cdb6d9bff483078203550177dabd4ed16bf462c7493af4
|
3 |
+
size 90866120
|
checkpoint-7749/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:254e2ba1d029861e2f7e1b7d917e6ab876636c3c0af898b8979e9336459deaeb
|
3 |
+
size 180607738
|
checkpoint-7749/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eeca4e1dc911e6617de8c0b5c3b6ee333073453ada941b2b075b2c148e12374d
|
3 |
+
size 14244
|
checkpoint-7749/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c4097bc439462cd2d9ca9794414c1789ad4482313b2479440f57e54bfa023df2
|
3 |
+
size 1064
|
checkpoint-7749/trainer_state.json
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 27.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 7749,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.0,
|
13 |
+
"learning_rate": 0.000484375,
|
14 |
+
"loss": 3.0823,
|
15 |
+
"step": 287
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 2.0,
|
19 |
+
"learning_rate": 0.00046875,
|
20 |
+
"loss": 2.7242,
|
21 |
+
"step": 574
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 3.0,
|
25 |
+
"learning_rate": 0.000453125,
|
26 |
+
"loss": 2.5348,
|
27 |
+
"step": 861
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 4.0,
|
31 |
+
"learning_rate": 0.0004375,
|
32 |
+
"loss": 2.4455,
|
33 |
+
"step": 1148
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 5.0,
|
37 |
+
"learning_rate": 0.000421875,
|
38 |
+
"loss": 2.3794,
|
39 |
+
"step": 1435
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 6.0,
|
43 |
+
"learning_rate": 0.00040625000000000004,
|
44 |
+
"loss": 2.3375,
|
45 |
+
"step": 1722
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 7.0,
|
49 |
+
"learning_rate": 0.000390625,
|
50 |
+
"loss": 2.3262,
|
51 |
+
"step": 2009
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 8.0,
|
55 |
+
"learning_rate": 0.000375,
|
56 |
+
"loss": 2.3114,
|
57 |
+
"step": 2296
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 9.0,
|
61 |
+
"learning_rate": 0.000359375,
|
62 |
+
"loss": 2.2921,
|
63 |
+
"step": 2583
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 10.0,
|
67 |
+
"learning_rate": 0.00034375,
|
68 |
+
"loss": 2.2918,
|
69 |
+
"step": 2870
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 11.0,
|
73 |
+
"learning_rate": 0.000328125,
|
74 |
+
"loss": 2.2578,
|
75 |
+
"step": 3157
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"epoch": 12.0,
|
79 |
+
"learning_rate": 0.0003125,
|
80 |
+
"loss": 2.2693,
|
81 |
+
"step": 3444
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 13.0,
|
85 |
+
"learning_rate": 0.000296875,
|
86 |
+
"loss": 2.2594,
|
87 |
+
"step": 3731
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 14.0,
|
91 |
+
"learning_rate": 0.00028125000000000003,
|
92 |
+
"loss": 2.2555,
|
93 |
+
"step": 4018
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 15.0,
|
97 |
+
"learning_rate": 0.000265625,
|
98 |
+
"loss": 2.2481,
|
99 |
+
"step": 4305
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"epoch": 16.0,
|
103 |
+
"learning_rate": 0.00025,
|
104 |
+
"loss": 2.2468,
|
105 |
+
"step": 4592
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"epoch": 17.0,
|
109 |
+
"learning_rate": 0.000234375,
|
110 |
+
"loss": 2.248,
|
111 |
+
"step": 4879
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 18.0,
|
115 |
+
"learning_rate": 0.00021875,
|
116 |
+
"loss": 2.2435,
|
117 |
+
"step": 5166
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 19.0,
|
121 |
+
"learning_rate": 0.00020312500000000002,
|
122 |
+
"loss": 2.2319,
|
123 |
+
"step": 5453
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 20.0,
|
127 |
+
"learning_rate": 0.0001875,
|
128 |
+
"loss": 2.2303,
|
129 |
+
"step": 5740
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 21.0,
|
133 |
+
"learning_rate": 0.000171875,
|
134 |
+
"loss": 2.2215,
|
135 |
+
"step": 6027
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 22.0,
|
139 |
+
"learning_rate": 0.00015625,
|
140 |
+
"loss": 2.2256,
|
141 |
+
"step": 6314
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"epoch": 23.0,
|
145 |
+
"learning_rate": 0.00014062500000000002,
|
146 |
+
"loss": 2.2257,
|
147 |
+
"step": 6601
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"epoch": 24.0,
|
151 |
+
"learning_rate": 0.000125,
|
152 |
+
"loss": 2.2275,
|
153 |
+
"step": 6888
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"epoch": 25.0,
|
157 |
+
"learning_rate": 0.000109375,
|
158 |
+
"loss": 2.2225,
|
159 |
+
"step": 7175
|
160 |
+
},
|
161 |
+
{
|
162 |
+
"epoch": 26.0,
|
163 |
+
"learning_rate": 9.375e-05,
|
164 |
+
"loss": 2.2166,
|
165 |
+
"step": 7462
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"epoch": 27.0,
|
169 |
+
"learning_rate": 7.8125e-05,
|
170 |
+
"loss": 2.2174,
|
171 |
+
"step": 7749
|
172 |
+
}
|
173 |
+
],
|
174 |
+
"logging_steps": 500,
|
175 |
+
"max_steps": 9184,
|
176 |
+
"num_input_tokens_seen": 0,
|
177 |
+
"num_train_epochs": 32,
|
178 |
+
"save_steps": 500,
|
179 |
+
"total_flos": 0.0,
|
180 |
+
"train_batch_size": 64,
|
181 |
+
"trial_name": null,
|
182 |
+
"trial_params": null
|
183 |
+
}
|
checkpoint-7749/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2d8d559a9f0f171908c0a1db0384f658c1e31689653c8277d9ca4a2848e36c6
|
3 |
+
size 4792
|
checkpoint-8036/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:094692f052bc3c1da865dd330ea4d8868877c2c651aa4728271543cff9649ebf
|
3 |
+
size 90866120
|
checkpoint-8036/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:efd5c0d40f596b5cc141af43a5a42a15ea6711ede357930782af5e5cdd103831
|
3 |
+
size 180607738
|
checkpoint-8036/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55ad740d18cd84d3d6ab714ade3167df9b6d81cc8473d8c6af14b20d61391900
|
3 |
+
size 14244
|
checkpoint-8036/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:edce3f06ab8ae28b57207a6ce227dcbc4fb37cdbf3e247d385951ac8ed165923
|
3 |
+
size 1064
|
checkpoint-8036/trainer_state.json
ADDED
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 28.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 8036,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.0,
|
13 |
+
"learning_rate": 0.000484375,
|
14 |
+
"loss": 3.0823,
|
15 |
+
"step": 287
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"epoch": 2.0,
|
19 |
+
"learning_rate": 0.00046875,
|
20 |
+
"loss": 2.7242,
|
21 |
+
"step": 574
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"epoch": 3.0,
|
25 |
+
"learning_rate": 0.000453125,
|
26 |
+
"loss": 2.5348,
|
27 |
+
"step": 861
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"epoch": 4.0,
|
31 |
+
"learning_rate": 0.0004375,
|
32 |
+
"loss": 2.4455,
|
33 |
+
"step": 1148
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 5.0,
|
37 |
+
"learning_rate": 0.000421875,
|
38 |
+
"loss": 2.3794,
|
39 |
+
"step": 1435
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 6.0,
|
43 |
+
"learning_rate": 0.00040625000000000004,
|
44 |
+
"loss": 2.3375,
|
45 |
+
"step": 1722
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 7.0,
|
49 |
+
"learning_rate": 0.000390625,
|
50 |
+
"loss": 2.3262,
|
51 |
+
"step": 2009
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 8.0,
|
55 |
+
"learning_rate": 0.000375,
|
56 |
+
"loss": 2.3114,
|
57 |
+
"step": 2296
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 9.0,
|
61 |
+
"learning_rate": 0.000359375,
|
62 |
+
"loss": 2.2921,
|
63 |
+
"step": 2583
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"epoch": 10.0,
|
67 |
+
"learning_rate": 0.00034375,
|
68 |
+
"loss": 2.2918,
|
69 |
+
"step": 2870
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"epoch": 11.0,
|
73 |
+
"learning_rate": 0.000328125,
|
74 |
+
"loss": 2.2578,
|
75 |
+
"step": 3157
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"epoch": 12.0,
|
79 |
+
"learning_rate": 0.0003125,
|
80 |
+
"loss": 2.2693,
|
81 |
+
"step": 3444
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"epoch": 13.0,
|
85 |
+
"learning_rate": 0.000296875,
|
86 |
+
"loss": 2.2594,
|
87 |
+
"step": 3731
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 14.0,
|
91 |
+
"learning_rate": 0.00028125000000000003,
|
92 |
+
"loss": 2.2555,
|
93 |
+
"step": 4018
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 15.0,
|
97 |
+
"learning_rate": 0.000265625,
|
98 |
+
"loss": 2.2481,
|
99 |
+
"step": 4305
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"epoch": 16.0,
|
103 |
+
"learning_rate": 0.00025,
|
104 |
+
"loss": 2.2468,
|
105 |
+
"step": 4592
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"epoch": 17.0,
|
109 |
+
"learning_rate": 0.000234375,
|
110 |
+
"loss": 2.248,
|
111 |
+
"step": 4879
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 18.0,
|
115 |
+
"learning_rate": 0.00021875,
|
116 |
+
"loss": 2.2435,
|
117 |
+
"step": 5166
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 19.0,
|
121 |
+
"learning_rate": 0.00020312500000000002,
|
122 |
+
"loss": 2.2319,
|
123 |
+
"step": 5453
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 20.0,
|
127 |
+
"learning_rate": 0.0001875,
|
128 |
+
"loss": 2.2303,
|
129 |
+
"step": 5740
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"epoch": 21.0,
|
133 |
+
"learning_rate": 0.000171875,
|
134 |
+
"loss": 2.2215,
|
135 |
+
"step": 6027
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 22.0,
|
139 |
+
"learning_rate": 0.00015625,
|
140 |
+
"loss": 2.2256,
|
141 |
+
"step": 6314
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"epoch": 23.0,
|
145 |
+
"learning_rate": 0.00014062500000000002,
|
146 |
+
"loss": 2.2257,
|
147 |
+
"step": 6601
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"epoch": 24.0,
|
151 |
+
"learning_rate": 0.000125,
|
152 |
+
"loss": 2.2275,
|
153 |
+
"step": 6888
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"epoch": 25.0,
|
157 |
+
"learning_rate": 0.000109375,
|
158 |
+
"loss": 2.2225,
|
159 |
+
"step": 7175
|
160 |
+
},
|
161 |
+
{
|
162 |
+
"epoch": 26.0,
|
163 |
+
"learning_rate": 9.375e-05,
|
164 |
+
"loss": 2.2166,
|
165 |
+
"step": 7462
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"epoch": 27.0,
|
169 |
+
"learning_rate": 7.8125e-05,
|
170 |
+
"loss": 2.2174,
|
171 |
+
"step": 7749
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"epoch": 28.0,
|
175 |
+
"learning_rate": 6.25e-05,
|
176 |
+
"loss": 2.2188,
|
177 |
+
"step": 8036
|
178 |
+
}
|
179 |
+
],
|
180 |
+
"logging_steps": 500,
|
181 |
+
"max_steps": 9184,
|
182 |
+
"num_input_tokens_seen": 0,
|
183 |
+
"num_train_epochs": 32,
|
184 |
+
"save_steps": 500,
|
185 |
+
"total_flos": 0.0,
|
186 |
+
"train_batch_size": 64,
|
187 |
+
"trial_name": null,
|
188 |
+
"trial_params": null
|
189 |
+
}
|
checkpoint-8036/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2d8d559a9f0f171908c0a1db0384f658c1e31689653c8277d9ca4a2848e36c6
|
3 |
+
size 4792
|