Spaces:
Build error
Build error
from pathlib import Path | |
import pandas as pd | |
from config import config | |
from config.config import logger | |
pronunciation_df = pd.read_csv(Path(config.READING_DATA_DIR, "all.csv")) | |
pronunciation_df = pronunciation_df.groupby("surface")["kana"].apply(list) | |
def repair_long_vowels(kana: str, kanji: str = None) -> str: | |
"""Clean and normalize text | |
Args: | |
kana (str): input string | |
kanji (str): input string, optional | |
Returns: | |
str: a cleaned string | |
""" | |
reading = kana | |
indices_of_dash = [pos for pos, char in enumerate(reading) if char == "γΌ"] | |
# get rid of non-ambiguous dashes | |
for index_of_dash in indices_of_dash: | |
char_before_dash = reading[index_of_dash - 1] | |
if char_before_dash in "γ¬γ€γ₯γγγγγγγγ γ΅γΆγ·": | |
reading = reading[:index_of_dash] + "γ" + reading[index_of_dash + 1 :] | |
elif char_before_dash in "γγγΏγγγγ²γ³γ‘γ’γ": | |
reading = reading[:index_of_dash] + "γ" + reading[index_of_dash + 1 :] | |
indices_of_not_dash = [pos for pos, char in enumerate(reading) if char != "γΌ"] | |
if len(indices_of_not_dash) != len(reading): | |
if not kanji: | |
logger.info("Disambiguating this dash requires kanji") | |
logger.info(f"Left dash in {reading}") | |
else: | |
try: | |
candidate_pronunciations = list(pronunciation_df[kanji]) | |
except KeyError: | |
candidate_pronunciations = [] | |
candidate_pronunciations = list(set(candidate_pronunciations)) | |
candidate_pronunciations = [ | |
x for x in candidate_pronunciations if len(x) == len(reading) | |
] | |
candidate_pronunciations = [ | |
x | |
for x in candidate_pronunciations | |
if all([x[i] == reading[i] for i in indices_of_not_dash]) | |
] | |
if len(candidate_pronunciations) == 1: | |
reading = candidate_pronunciations[0] | |
else: | |
pass | |
# logger.warning(f"Left dashes in {kanji} {reading}") | |
return reading | |