DFofanov78 committed
Commit 63ed2b2
Parent: 758a637

Upload folder using huggingface_hub

Files changed (6):
  1. .gitattributes +1 -0
  2. README.md +7 -8
  3. config.json +2 -11
  4. merges.txt +1 -1
  5. pytorch_model.bin +2 -2
  6. vocab.json +0 -0
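
The commit message says the folder was uploaded with `huggingface_hub`. For context, a minimal sketch of such an upload, assuming a local folder path and repo id (both hypothetical):

```
from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_folder(
    folder_path="./rugpt3large_based_on_gpt2",       # hypothetical local folder
    repo_id="DFofanov78/rugpt3large_based_on_gpt2",  # hypothetical repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```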
.gitattributes CHANGED
@@ -6,3 +6,4 @@
 *.tar.gz filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -7,19 +7,18 @@ tags:
 thumbnail: "https://github.com/sberbank-ai/ru-gpts"
 ---
 
-# rugpt3medium\_based\_on\_gpt2
+# rugpt3large\_based\_on\_gpt2
 The model architecture design, pretraining, and evaluation are documented in our preprint: [**A Family of Pretrained Transformer Language Models for Russian**](https://arxiv.org/abs/2309.10931).
 
-
-The model was pretrained with sequence length 1024 using the Transformers library by the [SberDevices](https://sberdevices.ru/) team on 80B tokens for 3 epochs. After that, the model was finetuned with the context size of 2048 tokens.
+The model was trained with sequence length 1024 using the Transformers library by the [SberDevices](https://sberdevices.ru/) team on 80B tokens for 3 epochs. After that, the model was finetuned for 1 epoch with sequence length 2048.
 
-Total training time was around 16 days on 64 GPUs.
-The final perplexity on the test set is `17.4`.
+Total training time was around 14 days on 128 GPUs for the 1024-token context and a few days on 16 GPUs for the 2048-token context.
+The final perplexity on the test set is `13.6`.
 
 # Authors
 + NLP core team RnD [Telegram channel](https://t.me/nlpcoreteam):
-+ Dmitry Zmitrovich
-
++ Dmitry Zmitrovich
+
 # Cite us
 ```
 @misc{zmitrovich2023family,
@@ -30,4 +29,4 @@ The final perplexity on the test set is `17.4`.
 archivePrefix={arXiv},
 primaryClass={cs.CL}
 }
-```
+```
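
After this change the README documents the large model. A minimal usage sketch with the Transformers library follows; the repo id is an assumption — substitute the id this folder was actually uploaded to.

```
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_id = "DFofanov78/rugpt3large_based_on_gpt2"  # hypothetical repo id

tokenizer = GPT2Tokenizer.from_pretrained(model_id)
model = GPT2LMHeadModel.from_pretrained(model_id)

# The README states the model was finetuned with sequence length 2048,
# which matches "n_ctx": 2048 in config.json below.
inputs = tokenizer("Александр Сергеевич Пушкин родился в ", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20, do_sample=True, top_k=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```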
config.json CHANGED
@@ -7,31 +7,22 @@
   "bos_token_id": 1,
   "embd_pdrop": 0.1,
   "eos_token_id": 2,
-  "id2label": {
-    "0": "LABEL_0"
-  },
+  "gradient_checkpointing": false,
   "initializer_range": 0.02,
-  "label2id": {
-    "LABEL_0": 0
-  },
   "layer_norm_epsilon": 1e-05,
   "model_type": "gpt2",
   "n_ctx": 2048,
-  "n_embd": 1024,
+  "n_embd": 1536,
   "n_head": 16,
   "n_inner": null,
   "n_layer": 24,
   "n_positions": 2048,
-  "n_special": 0,
-  "output_past": true,
   "pad_token_id": 0,
-  "predict_special_tokens": true,
   "resid_pdrop": 0.1,
   "summary_activation": null,
   "summary_first_dropout": 0.1,
   "summary_proj_to_labels": true,
   "summary_type": "cls_index",
   "summary_use_proj": true,
-  "use_cache": true,
   "vocab_size": 50257
 }
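
A small sketch to sanity-check the updated hyperparameters with Transformers' `GPT2Config`, assuming the committed config.json is in the working directory:

```
from transformers import GPT2Config

config = GPT2Config.from_json_file("config.json")

# Changed in this commit: the hidden size grows from 1024 (medium) to 1536 (large),
# and gradient_checkpointing is now written out explicitly.
assert config.n_embd == 1536
assert config.n_ctx == 2048 and config.n_positions == 2048
print(config.n_layer, config.n_head)  # 24 layers, 16 attention heads
```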
merges.txt CHANGED
@@ -1,4 +1,4 @@
-#version: 0.2 - Trained by `huggingface/tokenizers`
+#version: 0.2
 Ġ Ð
 Ð ¾
 Ð µ
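
The tokenizer can be rebuilt directly from the committed vocab.json and merges.txt; a minimal sketch, assuming both files are in the working directory:

```
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer(vocab_file="vocab.json", merges_file="merges.txt")

# merges.txt now carries only the bare "#version: 0.2" header; the merge rules
# such as "Ġ Ð" below it drive the byte-level BPE for Cyrillic text.
print(tokenizer.tokenize("привет мир"))
```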
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b90e512e3d01d703b231a0cb160d16c025e1f1175e06c3dead5b6b1c75383f28
-size 1730074771
+oid sha256:ce2ca7dbc12badb0a610df6f853168d6d9446a3b51e13e8af9186220eacc05cb
+size 3141928084
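
The new pytorch_model.bin is stored as a Git LFS pointer; a minimal sketch that verifies a downloaded weight file against the sha256 and size recorded above:

```
import hashlib
from pathlib import Path

EXPECTED_SHA256 = "ce2ca7dbc12badb0a610df6f853168d6d9446a3b51e13e8af9186220eacc05cb"
EXPECTED_SIZE = 3141928084  # ~3.1 GB, up from ~1.7 GB for the medium checkpoint

path = Path("pytorch_model.bin")
assert path.stat().st_size == EXPECTED_SIZE, "size mismatch"

digest = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
        digest.update(chunk)
assert digest.hexdigest() == EXPECTED_SHA256, "hash mismatch"
print("pytorch_model.bin matches the LFS pointer")
```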
vocab.json CHANGED
The diff for this file is too large to render. See raw diff