Upload 2 files
- .gitattributes +1 -0
- MassiveDatasetValidationData.csv +0 -0
- app.py +13 -10
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+MassiveDatasetValidationData.csv filter=lfs diff=lfs merge=lfs -text
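The added rule routes MassiveDatasetValidationData.csv through Git LFS, so the repository history keeps only a small pointer file while the actual bytes live in LFS storage. As an illustrative aside (not part of this Space's code), a pointer file can be recognized by its first line:

# Illustrative sketch only: a Git LFS pointer file begins with the LFS spec
# version line, whereas a fully checked-out file starts with the real CSV data.
from pathlib import Path

def is_lfs_pointer(path: str) -> bool:
    head = Path(path).read_bytes()[:100]
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

print(is_lfs_pointer("MassiveDatasetValidationData.csv"))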
MassiveDatasetValidationData.csv
CHANGED
The diff for this file is too large to render.
See raw diff
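This CSV is the validation dataset the Space reads. A minimal sketch of how it might be loaded, assuming (as the app.py diff below suggests) that the cached load_data() helper reads this file with pandas:

import pandas as pd
import streamlit as st

@st.cache_data  # cache the parsed DataFrame across Streamlit reruns
def load_data() -> pd.DataFrame:
    # Assumption: the validation data ships as a flat CSV at the repo root.
    return pd.read_csv("MassiveDatasetValidationData.csv")

df = load_data()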
app.py
CHANGED
@@ -9,7 +9,7 @@ import seaborn as sns
 import numpy as np
 import plotly.figure_factory as ff
 import plotly.express as px
-import random
+import random, glob
 
 @st.cache_data
 def load_data():
@@ -31,6 +31,13 @@ def reload_example_text_data():
 # TODO allow new tokenizers from HF
 tokenizer_names_to_test = [
     "openai/gpt4",
+    "Xenova/gpt-4o",
+    "Xenova/claude-tokenizer",
+    "CohereForAI/aya-101",
+    "meta-llama/Meta-Llama-3-70B",
+    "mistralai/Mixtral-8x22B-v0.1",
+    "google/gemma-7b",
+    "facebook/nllb-200-distilled-600M", # Facebook
     "xlm-roberta-base", # old style
     "bert-base-uncased", # old style
     "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
@@ -38,12 +45,16 @@ tokenizer_names_to_test = [
     "StabilityAI/stablelm-base-alpha-7b", # StableLM with Open Assistant
     "google/flan-t5-base", # Flan T5 (better than T5), Google
     "facebook/mbart-large-50", # Facebook
-    "facebook/nllb-200-distilled-600M", # Facebook
     "EleutherAI/gpt-neox-20b", # same as Pythia
 ]
 
 with st.sidebar:
 
+    st.header('All languages are NOT created (tokenized) equal!')
+    link="This project compares the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized) on [Art Fish Intelligence](https://www.artfish.ai/)."
+    st.markdown(link)
+
+    st.divider()
 
     st.subheader('Tokenizer')
     # TODO multi-select tokenizers
@@ -82,14 +93,6 @@ with st.sidebar:
     show_hist = st.checkbox('Show histogram', value=False)
 
 
-    st.subheader('About the project')
-    with st.expander("All languages are NOT created (tokenized) equal!"):
-
-        link="The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized)"
-        st.markdown(link)
-
-
-
 
 # dist_marginal = st.radio('Select distribution', options=['box', 'violin', 'rug'], horizontal=True)
 
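The core of the commit is the expanded tokenizer_names_to_test list. A minimal sketch of the comparison the app performs, using transformers' AutoTokenizer; note that some entries ("openai/gpt4", the Xenova/* exports, and gated repos such as "meta-llama/Meta-Llama-3-70B") may need special handling or authentication, so the sketch sticks to two of the openly available names:

from transformers import AutoTokenizer

# Roughly comparable greetings (illustrative only); per the linked article,
# token counts for the same message can differ by 10-20x across languages.
samples = {"English": "How are you today?", "Burmese": "ဒီနေ့ နေကောင်းလား"}

for name in ["xlm-roberta-base", "bert-base-uncased"]:
    tok = AutoTokenizer.from_pretrained(name)
    counts = {lang: len(tok.encode(text)) for lang, text in samples.items()}
    print(name, counts)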