""" | |
中文数据:clue superclue | |
英文数据:glue cnn_dailymail gigaword | |
代码数据: | |
数字: | |
## 参考 | |
- https://github.com/baichuan-inc/Baichuan-7B 记录了不同分词器的压缩率 | |
- 指标:猜测是 n_tokens/n_chars (baichuan小,说明百川token少,压缩率高) | |
- Baichuan 0.73; llama 1.31; | |
- https://github.com/QwenLM/Qwen/blob/main/tech_memo.md 记录了不同分词器的压缩率 | |
- 以 XLM-RoBERTa为基准 (Unsupervised Cross-lingual Representation Learning at Scale ) , | |
- Qwen-7B 在很多语言上压缩率都较高压缩率 (high compression rate) | |
- 中文: llama7b 2.2; baichuan7b 1.1; chatglm2-6b 0.9; qwen7b 0.95 | |
- 英文: | |
- 指标:猜测是 n_tokens / n_tokens_xlmR | |
- https://github.com/hpcaitech/ColossalAI/blob/4b8312c08e8d05a5f41453d63c8671aab601ed1c/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py#L134 | |
- 有压缩率的计算方式 | |
- https://github.com/hpcaitech/ColossalAI/blob/main/applications/Colossal-LLaMA-2/README.md#tokenizer | |
- 记录了不同分词器的压缩率 | |
- 指标: | |
- https://github.com/AUGMXNT/shisa/blob/6a823d77a71acbd18ab8f68a6b02f4b87ec9dddd/eval/tokenizer-efficiency-ja.py#L24 | |
- 有压缩率的计算方式 = {n_chars} / {n_tokens} | |
- | |
- https://github.com/huggingface/transformers/blob/cec773345aeffce3c04e8891303a3f748de7141e/src/transformers/models/whisper/generation_whisper.py#L354 | |
- 这个可能不是 | |
- https://github.com/bojone/bytepiece/blob/main/README_en.md | |
- "bytes/token": the average number of bytes per token | |
- Getting the most out of your tokenizer for pre-training and domain adaptation 👍 | |
- 定义: | |
- NSL: 两个分词器的编码长度 比例,通常以 llama为基准 | |
- average number of bytes per token. {n_bytes} / {n_tokens} | |
- higher compression rate -- | |
- *** https://github.com/microsoft/LLMLingua/blob/main/llmlingua/prompt_compressor.py | |
- 定义:{Compressed Size}/{Raw Size}, 来自论文 Language modeling is compression. 数值<=1.0,用 % 来表示。也有>1的情况。 | |
- | |
- {Compressed Size} 指的是? | |
- 这里的压缩指的是 模型参数相关的。 | |
""" | |
import json
import os

import pandas as pd
from datasets import load_dataset

from utils.log_util import logger
from vocab import load_tokener

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
def get_n_bytes_of_string(string_text):
    n_bytes = len(string_text.encode("utf-8"))
    return n_bytes
def unit_convertor(stat, unit):
    n_tokens = stat["n_tokens"]
    n_chars = stat["n_chars"]
    n_bytes = stat["n_bytes"]

    n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
    n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
    n_bytes_in_mb = n_bytes / (1024 * 1024)
    n_bytes_in_gb = n_bytes_in_mb / 1024
    n_bytes_in_tb = n_bytes_in_gb / 1024
    # n_chars_in_billion = n_chars / (1000 * 1000 * 1000)

    if unit == "n_tokens/n_bytes":
        value = n_tokens / n_bytes
    elif unit == "n_chars/n_tokens":  # important: average number of characters per token
        value = n_chars / n_tokens
    elif unit == "n_tokens/n_chars":  # how many tokens does one Chinese character need?
        value = n_tokens / n_chars
    elif unit == "g_bytes/b_tokens":
        value = n_bytes_in_gb / n_tokens_in_billion
    elif unit == "t_bytes/t_tokens":  # important
        value = n_bytes_in_tb / n_tokens_in_trillion
    elif unit == "b_tokens/g_bytes":
        value = n_tokens_in_billion / n_bytes_in_gb
    else:
        raise ValueError(f"unsupported unit: {unit}")
    return round(value, 2)
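# The module docstring also mentions NSL (normalized sequence length) from "Getting the most
# out of your tokenizer for pre-training and domain adaptation": the ratio of the encoded
# lengths of two tokenizers on the same text, usually with llama as the baseline. The helper
# below is an illustrative sketch of that ratio, computed from two stat dicts as produced by
# tokenize_corpus(); it is an assumption added here, not part of the original script.
def normalized_sequence_length(stat, baseline_stat):
    """NSL: n_tokens of the evaluated tokenizer divided by n_tokens of the baseline tokenizer."""
    return round(stat["n_tokens"] / baseline_stat["n_tokens"], 2)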
all_units = ["g_bytes/b_tokens", "t_bytes/t_tokens", "b_tokens/g_bytes"]
def pprint(stats):
    table = []
    for tokenizer_name, stat in stats.items():
        columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
        for unit in all_units:
            if unit not in stat:
                columns[unit] = unit_convertor(stat, unit)
            else:
                columns[unit] = stat[unit]
        table.append(columns)
    df = pd.DataFrame(table)
    # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
    logger.info(df.to_markdown(index=False))
    return
cache = {}


def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"):
    """
    Tokenize a cc100 corpus sample and collect token/char/byte counts.
    Results get their own cache (in memory and on disk) because tokenization is slow.
    :param tokenizer: tokenizer object exposing .encode(), .alias and .vocab_size
    :param lang: cc100 language code, e.g. "en" or "zh-Hans"
    :param cache_dir: directory for the file cache, resolved relative to the parent of this file
    :return: dict with vocab_size, n_bytes, n_tokens, n_chars
    """

    def _tokenize(tokenizer, dataset):
        n_tokens = 0
        n_chars = 0
        n_bytes = 0
        for item in dataset:
            text = item["text"]
            n_bytes += get_n_bytes_of_string(text)
            n_chars += len(text)
            encodings = tokenizer.encode(text)
            n_tokens += len(encodings)
        stat = {
            "vocab_size": tokenizer.vocab_size,
            "n_bytes": n_bytes,
            "n_tokens": n_tokens,
            "n_chars": n_chars,
        }
        return stat

    tokenizer_name = tokenizer.alias
    lang = lang.replace("cc100-", "")
    cache_id = f"{tokenizer_name}.{lang}"

    # L1: in-memory cache
    if cache_id in cache:
        logger.info(f"loading {cache_id} from in-memory cache")
        return cache[cache_id]

    # L2: file cache
    cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, f"{cache_id}.json")
    if os.path.exists(cache_path):
        logger.info(f"loading {cache_id} from file cache")
        stat = json.load(open(cache_path, "r", encoding="utf-8"))
        cache[cache_id] = stat
        return stat

    # tokenize corpus
    dataset = load_dataset("eson/cc100-samples", lang, split="train")
    stat = _tokenize(tokenizer, dataset)
    logger.info(f"saving {cache_id} to {cache_path}")
    json.dump(stat, open(cache_path, "w", encoding="utf-8"))
    logger.info(f"saving {cache_id} to in-memory cache")
    cache[cache_id] = stat
    return stat
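# Illustrative sketch (an assumption, not part of the original script): combine
# tokenize_corpus() with the normalized_sequence_length() helper above to reproduce the
# "ratio to a baseline tokenizer" style of metric referenced in the module docstring.
# The aliases "llama" and "llama2" are taken from main() below and assumed to be loadable.
def compare_to_baseline(tokenizer_name="llama2", baseline_name="llama", lang="zh-Hans"):
    stat = tokenize_corpus(load_tokener(tokenizer_name), lang)
    baseline_stat = tokenize_corpus(load_tokener(baseline_name), lang)
    return normalized_sequence_length(stat, baseline_stat)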
def main():
    from vocab import all_tokenizers

    stats = {}
    for lang in ["en", "zh-Hans"]:
        print("###" * 10 + lang)
        # for tokenizer_name in all_tokenizers:
        # for tokenizer_name in ["qwen1_5_14b_chat", "gpt_35_turbo"]:
        for tokenizer_name in ["llama", "llama2", "llama3"]:
            tokenizer = load_tokener(tokenizer_name)
            stat = tokenize_corpus(tokenizer, lang)
            stats[tokenizer_name] = stat
        pprint(stats)


if __name__ == "__main__":
    main()