Spaces:

passaglia
/

yomikata-demo

Build error

yomikata-demo / yomikata /dataset /repair_long_vowels.py

Sam Passaglia

initial commit

9aba307 over 1 year ago

2.14 kB

	from pathlib import Path

	import pandas as pd

	from config import config
	from config.config import logger

	# Read the table of known readings once at import time and collapse it into a
	# Series indexed by surface form, where each value is the list of every kana
	# reading recorded for that surface.
	pronunciation_df = (
	    pd.read_csv(Path(config.READING_DATA_DIR) / "all.csv")
	    .groupby("surface")["kana"]
	    .apply(list)
	)


	def repair_long_vowels(kana: str, kanji: str = None) -> str:
	    """Replace long-vowel marks ("ー") in a kana reading with explicit vowels.

	    The repair happens in two passes:

	    1. A dash that directly follows one of a fixed set of u-sound kana is
	       rewritten to "う", and one that follows one of a fixed set of i-sound
	       kana is rewritten to "い". A dash in the very first position has no
	       preceding character and is left untouched by this pass.
	    2. If dashes remain and ``kanji`` is given, the readings stored for that
	       surface in the module-level ``pronunciation_df`` are searched for
	       candidates that have the same length as the reading and agree with it
	       at every non-dash position. The reading is replaced only when exactly
	       one distinct candidate matches.

	    Args:
	        kana (str): reading that may contain "ー" characters.
	        kanji (str): surface form used to look up known readings, optional.
	            When omitted (or empty) and dashes remain after the first pass,
	            two info messages are logged and the dashes are left in place.

	    Returns:
	        str: the reading with every dash that could be resolved replaced;
	        unresolved dashes are returned unchanged.
	    """
	    reading = kana
	    dash_positions = [pos for pos, char in enumerate(reading) if char == "ー"]

	    # Pass 1: dashes whose vowel is determined by the preceding kana.
	    # Every replacement is exactly one character long, so the positions
	    # collected above stay valid while ``reading`` is rebuilt.
	    for pos in dash_positions:
	        if pos == 0:
	            # Nothing precedes a leading dash; reading[pos - 1] would wrap
	            # around to the *last* character and pick a vowel from it.
	            continue
	        previous = reading[pos - 1]
	        if previous in "ぬつづむるくぐすずゆゅふぶぷ":
	            vowel = "う"
	        elif previous in "しじみいきぎひびちぢぃ":
	            vowel = "い"
	        else:
	            continue
	        reading = reading[:pos] + vowel + reading[pos + 1 :]

	    fixed_positions = [pos for pos, char in enumerate(reading) if char != "ー"]
	    if len(fixed_positions) == len(reading):
	        # No dash left, nothing to disambiguate.
	        return reading

	    if not kanji:
	        logger.info("Disambiguating this dash requires kanji")
	        logger.info(f"Left dash in {reading}")
	        return reading

	    # Pass 2: look for a unique known reading of ``kanji`` that fits.
	    try:
	        known_readings = set(pronunciation_df[kanji])
	    except KeyError:
	        # Surface form is not in the pronunciation table.
	        known_readings = set()

	    matches = [
	        candidate
	        for candidate in known_readings
	        # Skip non-string entries (e.g. NaN from an empty cell), which have
	        # no length and can never be a valid reading.
	        if isinstance(candidate, str)
	        and len(candidate) == len(reading)
	        and all(candidate[i] == reading[i] for i in fixed_positions)
	    ]

	    # Zero or several matches means the dash is still ambiguous: keep it.
	    if len(matches) == 1:
	        reading = matches[0]

	    return reading