Spaces:

xu-song
/

tokenizer-arena

Running

App Files Files Community

tokenizer-arena / examples.py

xu-song

update

1f833af 9 months ago

raw

history blame

2.88 kB

	"""

	## characters

	- alphanumeric characters
	- numeric characters
	- special characters: A special character is a character that is not an alphabetic or numeric character.
	- ASCII control characters
	- punctuation marks
	- accent marks
	- mathematical symbols
	- whitespace:
	- https://en.wikipedia.org/wiki/Whitespace_character
	- https://emptycharacter.com/


	https://www.computerhope.com/jargon/s/specchar.htm
	"""

	# Sample inputs grouped by language code ("en", "zh").
	# Each entry is a three-element list: the sample text followed by two
	# names (presumably the two tokenizers to compare -- confirm against the
	# consumer of this table). The part of the text before the first ":" is
	# used as the label of the example.
	#
	# The whitespace samples are assembled from explicit space runs so the
	# number of spaces really matches the "2spaces" / "8spaces" labels, in
	# the same way the tabs and the newline precede their own labels. A plain
	# literal with repeated spaces is easily collapsed to a single space by
	# editors or copy/paste, which silently defeats the purpose of the sample.
	examples = {
	    "en": [
	        ["number: (10086 + 98) = 100184", "llama", "bloom"],
	        # Original note: chatglm has blank_n tokens.
	        [
	            "whitespace:" + " " * 2 + "2spaces" + " " * 8 + "8spaces\t1tab\t\t2tab\n1newline",
	            "llama",
	            "chatglm2_6b",
	        ],
	        # Further punctuation characters kept for reference:
	        # ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
	        ["punctuation: ,.:/?+=\"，。！？；【】〔〕〖〗", "baichuan", "llama"],
	        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
	    ],
	    "zh": [
	        # Original note: chatglm has blank_n tokens.
	        [
	            "空格测试：" + " " * 2 + "2个空格" + " " * 8 + "8个空格",
	            "llama",
	            "chatglm2_6b",
	        ],
	        ["标点测试：，。！？；", "baichuan_7b", "llama"],
	        ["符号测试：🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
	        ["数字测试：(10086 + 98) = 100184", "baichuan_7b", "llama"],
	        ["中文简体：宽带，繁体：樂來", "baichuan_7b", "llama"],
	    ],
	}

	# Additional comparison presets. Every entry is a 3-tuple of two tokenizer
	# names and a sample text (the text may be empty); the entries are grouped
	# below by tokenizer family.
	more_examples = [
	    # --- BERT family (bert VS clue; clue VS kplug) ---
	    ("bert_base_cased", "bert_base_uncased", ""),

	    # --- LLaMA family (sentencepiece-based) ---
	    (
	        "baichuan",
	        "baichuan2",
	        "baichuan2支持多空格，多个换行\n\n\n，do not add dummy prefix as Baichuan1",
	    ),
	    ("llama", "baichuan2", "baichuan2支持多空格，多个换行\n\n"),
	    ("llama", "chinese_llama2", ""),
	    ("chinese_llama", "chinese_llama2", ""),

	    # --- GLM family (sentencepiece-based) ---
	    ("glm", "chatglm1", ""),
	    ("chatglm1", "chatglm2", ""),

	    # --- GPT-2 family ---
	    ("gpt2", "moss", ""),
	    ("", "", ""),  # empty entry kept exactly as in the original data

	    # --- OpenAI family (tiktoken) ---
	    ("qwen", "gpt_35_turbo", ""),
	]

	# Language key used to select a sample set from `examples`.
	lang = "en"

	# Label of every sample in the selected language: the text that precedes
	# the first ":" in the sample's first element.
	example_types = [sample[0].partition(":")[0] for sample in examples[lang]]


	def example_fn(example_idx):
	    """Return the sample at position ``example_idx`` for the active language.

	    The lookup uses the module-level ``examples`` mapping with the
	    module-level ``lang`` key. The stored entry itself is returned, not a
	    copy. An index outside the sample list raises ``IndexError``.
	    """
	    samples_for_lang = examples[lang]
	    return samples_for_lang[example_idx]


	def get_more_example():
	    """Print one tokenizer-arena URL for every entry of ``more_examples``.

	    Each URL carries the two tokenizer names and the sample text as the
	    ``tokenizer1``, ``tokenizer2`` and ``text`` query parameters. Only the
	    text is percent-encoded; the tokenizer names are inserted verbatim.
	    """
	    from urllib.parse import quote

	    base = "https://huggingface.co/spaces/eson/tokenizer-arena"
	    for left, right, sample in more_examples:
	        query = f'tokenizer1={left}&tokenizer2={right}&text={quote(sample)}'
	        print(f'{base}?{query}')


	# Script entry point: running this file directly prints the preset URLs.
	if __name__ == "__main__":
	    get_more_example()