yomikata-demo / yomikata /dataset /repair_long_vowels.py
Sam Passaglia
initial commit
9aba307
raw
history blame
2.14 kB
from pathlib import Path
import pandas as pd
from config import config
from config.config import logger
# Lookup table built once at import time: a pandas Series indexed by surface
# form whose values are the list of every "kana" entry recorded for that
# surface in all.csv (duplicates are kept; callers de-duplicate as needed).
pronunciation_df = (
    pd.read_csv(Path(config.READING_DATA_DIR) / "all.csv")
    .groupby("surface")["kana"]
    .apply(list)
)
def repair_long_vowels(kana: str, kanji: str = None) -> str:
    """Replace long-vowel marks ("ー") in a kana reading with explicit vowels.

    The repair runs in two passes:

    1. Unambiguous marks: a mark directly after one of the kana in
       ``kana_before_u`` becomes "う", and a mark directly after one of the
       kana in ``kana_before_i`` becomes "い". A mark at the very start of
       the string has no preceding kana and is left alone.
    2. Remaining marks: if ``kanji`` is given, the readings stored for it in
       ``pronunciation_df`` are searched for candidates that have the same
       length as the reading and agree with it at every position that is
       not a mark. The reading is replaced only when exactly one such
       candidate exists.

    Args:
        kana (str): reading that may contain long-vowel marks
        kanji (str): surface form used to look up candidate readings, optional

    Returns:
        str: the reading with every resolvable mark replaced; marks that
            cannot be resolved are left in place
    """
    long_vowel_mark = "ー"
    kana_before_u = "ぬつづむるくぐすずゆゅふぶぷ"
    kana_before_i = "しじみいきぎひびちぢぃ"

    # Pass 1: get rid of non-ambiguous marks. The list is edited in place so
    # that a mark following an already-repaired mark sees the repaired vowel.
    chars = list(kana)
    for pos, char in enumerate(chars):
        # pos == 0 is skipped: chars[-1] would wrap around to the last
        # character, which is not the kana preceding the mark.
        if char != long_vowel_mark or pos == 0:
            continue
        previous = chars[pos - 1]
        if previous in kana_before_u:
            chars[pos] = "う"
        elif previous in kana_before_i:
            chars[pos] = "い"
    reading = "".join(chars)

    if long_vowel_mark not in reading:
        return reading

    # Pass 2: the remaining marks can only be resolved from known readings.
    if not kanji:
        logger.info("Disambiguating this dash requires kanji")
        logger.info(f"Left dash in {reading}")
        return reading

    try:
        known_readings = set(pronunciation_df[kanji])
    except KeyError:
        # No readings recorded for this surface form: nothing to match.
        return reading

    fixed_positions = [
        pos for pos, char in enumerate(reading) if char != long_vowel_mark
    ]
    matches = [
        candidate
        for candidate in known_readings
        if len(candidate) == len(reading)
        and all(candidate[pos] == reading[pos] for pos in fixed_positions)
    ]

    # Only accept the lookup when it is unique; otherwise keep the marks.
    if len(matches) == 1:
        reading = matches[0]
    return reading