Spaces:

xu-song
/

tokenizer-arena

Running

App Files Files Community

tokenizer-arena / utils /lang_util_2.py

xu-song

update

f331792 7 months ago

raw

history blame

3.05 kB

	"""
	日语、韩语等
	https://www.cnblogs.com/luoganttcc/p/16605150.html
	https://zhuanlan.zhihu.com/p/618684374
	- https://zhuanlan.zhihu.com/p/84625185 赞


	## 相关包

	import opencc
	import langid
	import langdetect
	https://github.com/pemistahl/lingua-py
	- 原理：


	"""



	from zhon.hanzi import punctuation as zh_punc

	def is_zh_char(uchar):
	    """Return True if ``uchar`` sorts within U+4E00..U+9FA5 (inclusive).

	    The check is a plain string comparison against the two bounds.
	    For reference, jieba matches the slightly wider range U+4E00..U+9FD5:
	    https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
	    """
	    first_hanzi = u'\u4e00'
	    last_hanzi = u'\u9fa5'
	    # Guard clause: anything sorting below the lower bound is rejected first.
	    if uchar < first_hanzi:
	        return False
	    return uchar <= last_hanzi

	def has_zh_punc(text):
	    """Return True if at least one character of ``text`` occurs in ``zh_punc``.

	    ``zh_punc`` is the ``punctuation`` object imported from ``zhon.hanzi``
	    (Chinese punctuation marks).
	    """
	    for ch in text:
	        if ch in zh_punc:
	            return True
	    return False


	def has_zh(text):
	    """Return True if ``text`` contains at least one character accepted by ``is_zh_char``."""
	    for ch in text:
	        if is_zh_char(ch):
	            return True
	    return False


	def get_zh_count(text):
	    """Return how many characters of ``text`` are accepted by ``is_zh_char``."""
	    total = 0
	    for ch in text:
	        if is_zh_char(ch):
	            total += 1
	    return total


	def is_all_zh(text):
	    """Return True if every character of ``text`` is accepted by ``is_zh_char``.

	    An empty ``text`` yields True, because there is no character to reject.
	    """
	    for ch in text:
	        if not is_zh_char(ch):
	            return False
	    return True


	def is_all_en(text):
	    """Return True if ``text`` is non-empty and consists only of ASCII letters.

	    The text is encoded to UTF-8 and tested with ``bytes.isalpha``, which
	    accepts only the ASCII letters a-z and A-Z.  Every non-ASCII character
	    encodes to bytes outside that set, so it makes the result False.
	    """
	    utf8_bytes = text.encode('utf-8')
	    return utf8_bytes.isalpha()




	# Inclusive code-point ranges; each entry is a dict with integer "from" and
	# "to" keys, built from the (first, last) pairs listed below.
	ranges = [
	    {"from": first, "to": last}
	    for first, last in (
	        (0x3300, 0x33FF),    # compatibility ideographs
	        (0xFE30, 0xFE4F),    # compatibility ideographs
	        (0xF900, 0xFAFF),    # compatibility ideographs
	        (0x2F800, 0x2FA1F),  # compatibility ideographs
	        (0x3040, 0x309F),    # Japanese Hiragana (96 code points)
	        (0x30A0, 0x30FF),    # Japanese Katakana (96 code points)
	        (0x2E80, 0x2EFF),    # CJK radicals supplement
	        (0x4E00, 0x9FFF),    # Chinese; the commonly quoted range is U+4E00..U+9FA5
	        (0x3400, 0x4DBF),
	        (0x20000, 0x2A6DF),
	        (0x2A700, 0x2B73F),
	        (0x2B740, 0x2B81F),
	        (0x2B820, 0x2CEAF),  # included as of Unicode 8.0
	    )
	]

	# Korean [\uac00-\ud7ff] is noted here for reference; it is not in the table above.


	def is_cjk(char):
	    """Return True if ``char`` falls inside any code-point range listed in ``ranges``.

	    CJK stands for Chinese, Japanese and Korean.  Japanese uses many Chinese
	    characters (more than 20,000 kanji); Korean has the Hangul script (more
	    than 50 letters) plus more than 20,000 Chinese-derived characters.
	    NOTE: the ``ranges`` table in this file has no Hangul entry, so Hangul
	    syllables are reported as non-CJK.

	    Args:
	        char: a single character.  ``ord`` raises ``TypeError`` for a string
	            of any other length.

	    Returns:
	        bool: whether the character's code point lies in one of the ranges.
	    """
	    # Compute the code point once instead of once per range.
	    code_point = ord(char)
	    # A generator lets any() stop at the first matching range, and the loop
	    # variable no longer shadows the builtin ``range``.
	    return any(span["from"] <= code_point <= span["to"] for span in ranges)


	def cjk_substrings(string):
	    """Yield each maximal run of consecutive CJK characters in ``string``.

	    Runs are produced left to right; characters rejected by ``is_cjk`` act as
	    separators and are never yielded.  An empty string yields nothing.

	    Args:
	        string: the text to scan.

	    Yields:
	        str: one substring per uninterrupted run of CJK characters.
	    """
	    length = len(string)
	    i = 0
	    while i < length:
	        if not is_cjk(string[i]):
	            i += 1
	            continue
	        start = i
	        # Bound the scan: without the length check, a run that reaches the end
	        # of the string would index one past the last character (IndexError).
	        while i < length and is_cjk(string[i]):
	            i += 1
	        yield string[start:i]


	def aa():
	    """Debug helper: print a sample of characters for every entry of ``ranges``.

	    Each output line holds the entry's index, then the ten characters that
	    start at the entry's "from" code point, each followed by ", ".
	    """
	    # A disabled demo used to sit here: it wrapped every CJK run of a sample
	    # string in parentheses using cjk_substrings().
	    for idx, item in enumerate(ranges):
	        first = item["from"]
	        preview = "".join(chr(first + offset) + ", " for offset in range(10))
	        print(str(idx) + ": " + preview)


	def is_traditional_chinese(text):
	    """Return True if ``text`` changes when converted from Traditional to Simplified.

	    The text is passed through an OpenCC converter configured with 't2s'.
	    Any difference between input and output means at least one character was
	    rewritten by the conversion.

	    Args:
	        text: the string to inspect.

	    Returns:
	        bool: True when the converted text differs from ``text``.
	    """
	    # Imported locally: no top-level ``import opencc`` is visible in this file
	    # (the name only appears inside the module docstring), so the bare
	    # reference would otherwise raise NameError.
	    import opencc

	    converter = opencc.OpenCC('t2s')
	    return converter.convert(text) != text



	# aa()