yenniejun committed
Commit 8c475b6
Parent: c4e64d6

Upload 2 files

Files changed (3)
  1. .gitattributes +1 -0
  2. MassiveDatasetValidationData.csv +0 -0
  3. app.py +13 -10
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+MassiveDatasetValidationData.csv filter=lfs diff=lfs merge=lfs -text
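The added rule is what running git lfs track "MassiveDatasetValidationData.csv" would append here: it makes Git store the large CSV below as an LFS pointer rather than a regular blob.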
MassiveDatasetValidationData.csv CHANGED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -9,7 +9,7 @@ import seaborn as sns
 import numpy as np
 import plotly.figure_factory as ff
 import plotly.express as px
-import random
+import random, glob
 
 @st.cache_data
 def load_data():
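The hunk cuts off before the body of load_data(). For orientation only — the real implementation is not shown in this diff — a cached loader for the CSV uploaded in this commit would look roughly like the following sketch (reading the file with pandas is an assumption):

import pandas as pd
import streamlit as st

@st.cache_data  # memoize across Streamlit reruns so the CSV is parsed once
def load_data():
    # Assumption: the app reads the LFS-tracked CSV added in this commit
    return pd.read_csv("MassiveDatasetValidationData.csv")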
@@ -31,6 +31,13 @@ def reload_example_text_data():
 # TODO allow new tokenizers from HF
 tokenizer_names_to_test = [
     "openai/gpt4",
+    "Xenova/gpt-4o",
+    "Xenova/claude-tokenizer",
+    "CohereForAI/aya-101",
+    "meta-llama/Meta-Llama-3-70B",
+    "mistralai/Mixtral-8x22B-v0.1",
+    "google/gemma-7b",
+    "facebook/nllb-200-distilled-600M", # Facebook
     "xlm-roberta-base", # old style
     "bert-base-uncased", # old style
     "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
@@ -38,12 +45,16 @@ tokenizer_names_to_test = [
     "StabilityAI/stablelm-base-alpha-7b", # StableLM with Open Assistant
     "google/flan-t5-base", # Flan T5 (better than T5), Google
     "facebook/mbart-large-50", # Facebook
-    "facebook/nllb-200-distilled-600M", # Facebook
     "EleutherAI/gpt-neox-20b", # same as Pythia
 ]
 
 with st.sidebar:
 
+    st.header('All languages are NOT created (tokenized) equal!')
+    link="This project compares the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized) on [Art Fish Intelligence](https://www.artfish.ai/)."
+    st.markdown(link)
+
+    st.divider()
 
     st.subheader('Tokenizer')
     # TODO multi-select tokenizers
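The sidebar copy claims a 10-20x token-count spread between languages (e.g. English vs. Burmese). A minimal way to spot-check that claim with Hugging Face's AutoTokenizer is sketched below; the sample strings are illustrative, "openai/gpt4" is not a downloadable Hub checkpoint, and gated repos such as meta-llama/Meta-Llama-3-70B require an access token, so only openly available names from the list are used:

from transformers import AutoTokenizer

# Illustrative sample strings (not taken from the app's dataset)
samples = {
    "English": "Hello, how are you today?",
    "Burmese": "မင်္ဂလာပါ၊ ဒီနေ့ နေကောင်းလား။",
}

# Compare per-language token counts for a few tokenizers from the app's list
for name in ["xlm-roberta-base", "google/flan-t5-base", "facebook/nllb-200-distilled-600M"]:
    tok = AutoTokenizer.from_pretrained(name)
    counts = {lang: len(tok.encode(text, add_special_tokens=False))
              for lang, text in samples.items()}
    print(name, counts)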
@@ -82,14 +93,6 @@ with st.sidebar:
     show_hist = st.checkbox('Show histogram', value=False)
 
 
-    st.subheader('About the project')
-    with st.expander("All languages are NOT created (tokenized) equal!"):
-
-        link="The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized)"
-        st.markdown(link)
-
-
-
 
     # dist_marginal = st.radio('Select distribution', options=['box', 'violin', 'rug'], horizontal=True)
 