xu-song commited on
Commit
47e1616
1 Parent(s): 0d55475
Files changed (2) hide show
  1. compression_app.py +10 -6
  2. compression_util.py +6 -3
compression_app.py CHANGED
@@ -43,12 +43,12 @@ Lossless tokenization preserves the exact original text, i.e. `decoded_text = in
43
 
44
  - **Compression Rate** <br>
45
  There are mainly two types of metric to represent the `input_text`:
46
- - `byte-level`: the number of bytes in the given text
47
- - `char-level`: the number of characters in the given text.
48
 
49
- To evaluate compression rate, simple metrics can be "how many bytes per token" or "how many chars per token". <br>
50
- In this leaderboard, we adopt more frequently used metric: "how many billion tokens per gigabytes corpus" and "how many chars
51
- per token", i.e. `b_tokens/g_bytes` and `char/token`.
52
  💬 [Discussions is Welcome](https://huggingface.co/spaces/eson/tokenizer-arena/discussions)
53
  """
54
 
@@ -141,7 +141,11 @@ with gr.Blocks(theme=theme) as demo:
141
  "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
142
  )
143
 
144
- gr.Markdown("## 🏆 Compression Rate Leaderboard")
 
 
 
 
145
  search_bar = gr.Textbox(
146
  placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
147
  show_label=False,
 
43
 
44
  - **Compression Rate** <br>
45
  There are mainly two types of metric to represent the `input_text`:
46
+ - `char-level`: the number of characters in the given text
47
+ - `byte-level`: the number of bytes in the given text.
48
 
49
+ To evaluate compression rate, simple metrics can be "how many chars per token" or "how many bytes per token". <br>
50
+ In this leaderboard, we adopt the more frequently used metrics: "how many chars per token" and
51
+ "how many billion tokens per gigabytes corpus", i.e. `char/token` and `b_tokens/g_bytes`.
52
  💬 [Discussions is Welcome](https://huggingface.co/spaces/eson/tokenizer-arena/discussions)
53
  """
54
 
 
141
  "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
142
  )
143
 
144
+ gr.Markdown("## 🏆 Compression Rate Leaderboard\n"
145
+ "The leaderboard aims to evaluate tokenizer performance on different languages.\n"
146
+ "Lower `oov_ratio` refers to fewer out-of-vocabulary tokens.\n"
147
+ "Higher `char/token` means fewer words are segmented into subwords."
148
+ )
149
  search_bar = gr.Textbox(
150
  placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
151
  show_label=False,
compression_util.py CHANGED
@@ -295,9 +295,12 @@ def get_compression_leaderboard(
295
  if return_type == "dataframe":
296
  token_number_unit, file_size_unit = unit.split("/")
297
  reverse_unit = f"{file_size_unit}/{token_number_unit}"
298
- stats = to_dataframe(stats, [unit, reverse_unit, "char/token"])
299
- stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
300
- stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={unit: f' ⬆️{unit}'}) # ⬇
 
 
 
301
  return stats
302
 
303
 
 
295
  if return_type == "dataframe":
296
  token_number_unit, file_size_unit = unit.split("/")
297
  reverse_unit = f"{file_size_unit}/{token_number_unit}"
298
+ stats = to_dataframe(stats, ["char/token", unit, reverse_unit])
299
+ stats = stats.sort_values(["oov_ratio", "char/token"], ascending=[True, False])
300
+
301
+ # stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
302
+
303
+ stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={"char/token": ' ⬇️char/token'}) #
304
  return stats
305
 
306