Spaces:
Build error
Build error
File size: 2,139 Bytes
9aba307 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
from pathlib import Path
import pandas as pd
from config import config
from config.config import logger
# Load the reading table and group it by the "surface" column, collecting each
# surface form's "kana" values into a list. Despite the name, the result is a
# pandas Series indexed by surface form, so `pronunciation_df[surface]` yields
# that form's list of readings and raises KeyError for an unknown form.
pronunciation_df = (
    pd.read_csv(Path(config.READING_DATA_DIR, "all.csv"))
    .groupby("surface")["kana"]
    .apply(list)
)
def repair_long_vowels(kana: str, kanji: str = None) -> str:
    """Replace long-vowel dashes ("ー") in a kana reading with explicit vowels.

    A dash directly after a kana that ends in an "u" sound is rewritten as
    "う", and a dash directly after a kana that ends in an "i" sound is
    rewritten as "い". Any dash that remains is ambiguous; when ``kanji`` is
    given, the known readings for that surface form are looked up in
    ``pronunciation_df`` and, if exactly one of them has the same length as
    the reading and agrees with it at every non-dash position, that reading
    is returned instead.

    Args:
        kana (str): reading to repair, possibly containing "ー".
        kanji (str): surface form used to disambiguate leftover dashes,
            optional.

    Returns:
        str: the repaired reading. Dashes that could not be resolved are
        left in place.
    """
    dash = "ー"
    followed_by_u = "ぬつづむるくぐすずゆゅふぶぷ"
    followed_by_i = "しじみいきぎひびちぢぃ"

    reading = kana

    # Resolve the unambiguous dashes. Every replacement swaps one character
    # for one character, so the positions collected up front stay valid.
    dash_positions = [pos for pos, char in enumerate(reading) if char == dash]
    for pos in dash_positions:
        if pos == 0:
            # A leading dash has no preceding kana; indexing pos - 1 would
            # wrap around to the last character of the reading.
            continue
        previous = reading[pos - 1]
        if previous in followed_by_u:
            reading = reading[:pos] + "う" + reading[pos + 1 :]
        elif previous in followed_by_i:
            reading = reading[:pos] + "い" + reading[pos + 1 :]

    if dash not in reading:
        return reading

    # What is left cannot be decided from the kana alone.
    if not kanji:
        logger.info("Disambiguating this dash requires kanji")
        logger.info(f"Left dash in {reading}")
        return reading

    try:
        candidates = set(pronunciation_df[kanji])
    except KeyError:
        # Unknown surface form: nothing to compare against.
        return reading

    fixed_positions = [pos for pos, char in enumerate(reading) if char != dash]
    matches = [
        candidate
        for candidate in candidates
        # Skip non-string entries (e.g. NaN from the CSV) instead of crashing.
        if isinstance(candidate, str)
        and len(candidate) == len(reading)
        and all(candidate[pos] == reading[pos] for pos in fixed_positions)
    ]

    # Only accept the dictionary reading when it is the single consistent one.
    if len(matches) == 1:
        return matches[0]
    return reading
|