victormiller
committed on
Commit
•
407f850
1
Parent(s):
a5b96ac
Update README.md
Browse files
README.md
CHANGED
@@ -25,15 +25,15 @@ The following data mix was used to train K2 and achieve results in line with Lla
|
|
25 |
| pubmed-abstracts | 4.77B | 3x | 14.3B | 1.1% |
|
26 |
| uspto | 4.77B | 3x | 14.3B | 1.1% |
|
27 |
| pubmed-central | 26B | 1x | 26B | 2% |
|
28 |
-
| redpajama.arxiv | 27.3B | 1x | 27.3B | 2.1% |
|
29 |
| starcoder.spm | 67.6B | 0.5x | 33.8B | 2.6% |
|
30 |
| starcoder.fim | 67.6B | 0.5x | 33.8B | 2.6% |
|
31 |
-
| redpajama.stackexchange | 61.1B | 1x | 61.1B | 4.7% |
|
32 |
| starcoder | 132.6B | 0.5x | 66.3B | 5.1% |
|
33 |
| pile-of-law | 76.7B | 1x | 76.7B | 5.9% |
|
34 |
-
| redpajama.book | 80.6B | 1x | 80.6B | 6.2% |
|
35 |
| s2orc | 107.9B | 1x | 107.9B | 8.3% |
|
36 |
-
| redpajama.wikipedia | 22.1B | 6x | 132.6B | 10.2% |
|
37 |
| refinedweb | 612.3B | 1x | 612.3B | 47.1% |
|
38 |
| Totals | - | - | 1.3T | 100% |
|
39 |
|
|
|
25 |
| pubmed-abstracts | 4.77B | 3x | 14.3B | 1.1% |
|
26 |
| uspto | 4.77B | 3x | 14.3B | 1.1% |
|
27 |
| pubmed-central | 26B | 1x | 26B | 2% |
|
28 |
+
| [redpajama.arxiv](https://huggingface.co/datasets/cerebras/SlimPajama-627B) | 27.3B | 1x | 27.3B | 2.1% |
|
29 |
| starcoder.spm | 67.6B | 0.5x | 33.8B | 2.6% |
|
30 |
| starcoder.fim | 67.6B | 0.5x | 33.8B | 2.6% |
|
31 |
+
| [redpajama.stackexchange](https://huggingface.co/datasets/cerebras/SlimPajama-627B) | 61.1B | 1x | 61.1B | 4.7% |
|
32 |
| starcoder | 132.6B | 0.5x | 66.3B | 5.1% |
|
33 |
| pile-of-law | 76.7B | 1x | 76.7B | 5.9% |
|
34 |
+
| [redpajama.book](https://huggingface.co/datasets/cerebras/SlimPajama-627B) | 80.6B | 1x | 80.6B | 6.2% |
|
35 |
| s2orc | 107.9B | 1x | 107.9B | 8.3% |
|
36 |
+
| [redpajama.wikipedia](https://huggingface.co/datasets/cerebras/SlimPajama-627B) | 22.1B | 6x | 132.6B | 10.2% |
|
37 |
| refinedweb | 612.3B | 1x | 612.3B | 47.1% |
|
38 |
| Totals | - | - | 1.3T | 100% |
|
39 |
|