DFofanov78 committed
Commit 63ed2b2
Parent: 758a637

Upload folder using huggingface_hub

Files changed (6):
  1. .gitattributes +1 -0
  2. README.md +7 -8
  3. config.json +2 -11
  4. merges.txt +1 -1
  5. pytorch_model.bin +2 -2
  6. vocab.json +0 -0
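
The commit message says the folder was uploaded with `huggingface_hub`. For context, a minimal sketch of such an upload, assuming a local folder path and repo id (both hypothetical):

```
from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_folder(
    folder_path="./rugpt3large_based_on_gpt2",       # hypothetical local folder
    repo_id="DFofanov78/rugpt3large_based_on_gpt2",  # hypothetical repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```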
.gitattributes CHANGED
@@ -6,3 +6,4 @@
 *.tar.gz filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -7,19 +7,18 @@ tags:
 thumbnail: "https://github.com/sberbank-ai/ru-gpts"
 ---
 
-# rugpt3medium\_based\_on\_gpt2
+# rugpt3large\_based\_on\_gpt2
 The model architecture design, pretraining, and evaluation are documented in our preprint: [**A Family of Pretrained Transformer Language Models for Russian**](https://arxiv.org/abs/2309.10931).
 
-
-The model was pretrained with sequence length 1024 using the Transformers library by the [SberDevices](https://sberdevices.ru/) team on 80B tokens for 3 epochs. After that, the model was finetuned with the context size of 2048 tokens.
+The model was trained with sequence length 1024 using the Transformers library by the [SberDevices](https://sberdevices.ru/) team on 80B tokens for 3 epochs. After that, the model was finetuned for 1 epoch with sequence length 2048.
 
-Total training time was around 16 days on 64 GPUs.
-The final perplexity on the test set is `17.4`.
+Total training time was around 14 days on 128 GPUs for the 1024-token context and a few days on 16 GPUs for the 2048-token context.
+The final perplexity on the test set is `13.6`.
 
 # Authors
 + NLP core team RnD [Telegram channel](https://t.me/nlpcoreteam):
-+ Dmitry Zmitrovich
-
++ Dmitry Zmitrovich
+
 # Cite us
 ```
 @misc{zmitrovich2023family,
@@ -30,4 +29,4 @@ The final perplexity on the test set is `17.4`.
 archivePrefix={arXiv},
 primaryClass={cs.CL}
 }
-```
+```
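
After this change the README documents the large model. A minimal usage sketch with the Transformers library follows; the repo id is an assumption — substitute the id this folder was actually uploaded to.

```
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_id = "DFofanov78/rugpt3large_based_on_gpt2"  # hypothetical repo id

tokenizer = GPT2Tokenizer.from_pretrained(model_id)
model = GPT2LMHeadModel.from_pretrained(model_id)

# The README states the model was finetuned with sequence length 2048,
# which matches "n_ctx": 2048 in config.json below.
inputs = tokenizer("Александр Сергеевич Пушкин родился в ", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20, do_sample=True, top_k=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```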
config.json CHANGED
@@ -7,31 +7,22 @@
   "bos_token_id": 1,
   "embd_pdrop": 0.1,
   "eos_token_id": 2,
-  "id2label": {
-    "0": "LABEL_0"
-  },
+  "gradient_checkpointing": false,
   "initializer_range": 0.02,
-  "label2id": {
-    "LABEL_0": 0
-  },
   "layer_norm_epsilon": 1e-05,
   "model_type": "gpt2",
   "n_ctx": 2048,
-  "n_embd": 1024,
+  "n_embd": 1536,
   "n_head": 16,
   "n_inner": null,
   "n_layer": 24,
   "n_positions": 2048,
-  "n_special": 0,
-  "output_past": true,
   "pad_token_id": 0,
-  "predict_special_tokens": true,
   "resid_pdrop": 0.1,
   "summary_activation": null,
   "summary_first_dropout": 0.1,
   "summary_proj_to_labels": true,
   "summary_type": "cls_index",
   "summary_use_proj": true,
-  "use_cache": true,
   "vocab_size": 50257
 }
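
A small sketch to sanity-check the updated hyperparameters with Transformers' `GPT2Config`, assuming the committed config.json is in the working directory:

```
from transformers import GPT2Config

config = GPT2Config.from_json_file("config.json")

# Changed in this commit: the hidden size grows from 1024 (medium) to 1536 (large),
# and gradient_checkpointing is now written out explicitly.
assert config.n_embd == 1536
assert config.n_ctx == 2048 and config.n_positions == 2048
print(config.n_layer, config.n_head)  # 24 layers, 16 attention heads
```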
merges.txt CHANGED
@@ -1,4 +1,4 @@
-#version: 0.2 - Trained by `huggingface/tokenizers`
+#version: 0.2
 Ġ Ð
 Ð ¾
 Ð µ
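
The tokenizer can be rebuilt directly from the committed vocab.json and merges.txt; a minimal sketch, assuming both files are in the working directory:

```
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer(vocab_file="vocab.json", merges_file="merges.txt")

# merges.txt now carries only the bare "#version: 0.2" header; the merge rules
# such as "Ġ Ð" below it drive the byte-level BPE for Cyrillic text.
print(tokenizer.tokenize("привет мир"))
```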
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b90e512e3d01d703b231a0cb160d16c025e1f1175e06c3dead5b6b1c75383f28
-size 1730074771
+oid sha256:ce2ca7dbc12badb0a610df6f853168d6d9446a3b51e13e8af9186220eacc05cb
+size 3141928084
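
The new pytorch_model.bin is stored as a Git LFS pointer; a minimal sketch that verifies a downloaded weight file against the sha256 and size recorded above:

```
import hashlib
from pathlib import Path

EXPECTED_SHA256 = "ce2ca7dbc12badb0a610df6f853168d6d9446a3b51e13e8af9186220eacc05cb"
EXPECTED_SIZE = 3141928084  # ~3.1 GB, up from ~1.7 GB for the medium checkpoint

path = Path("pytorch_model.bin")
assert path.stat().st_size == EXPECTED_SIZE, "size mismatch"

digest = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
        digest.update(chunk)
assert digest.hexdigest() == EXPECTED_SHA256, "hash mismatch"
print("pytorch_model.bin matches the LFS pointer")
```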
vocab.json CHANGED
The diff for this file is too large to render. See raw diff