Spaces:

passaglia
/

yomikata-demo

Build error

File size: 2,139 Bytes

9aba307

from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

pronunciation_df = pd.read_csv(Path(config.READING_DATA_DIR, "all.csv"))
pronunciation_df = pronunciation_df.groupby("surface")["kana"].apply(list)


def repair_long_vowels(kana: str, kanji: str = None) -> str:
    """Clean and normalize text

    Args:
        kana (str): input string
        kanji (str): input string, optional

    Returns:
        str: a cleaned string
    """

    reading = kana
    indices_of_dash = [pos for pos, char in enumerate(reading) if char == "ー"]

    # get rid of non-ambiguous dashes
    for index_of_dash in indices_of_dash:
        char_before_dash = reading[index_of_dash - 1]
        if char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ":
            reading = reading[:index_of_dash] + "う" + reading[index_of_dash + 1 :]
        elif char_before_dash in "しじみいきぎひびちぢぃ":
            reading = reading[:index_of_dash] + "い" + reading[index_of_dash + 1 :]

    indices_of_not_dash = [pos for pos, char in enumerate(reading) if char != "ー"]
    if len(indices_of_not_dash) != len(reading):
        if not kanji:
            logger.info("Disambiguating this dash requires kanji")
            logger.info(f"Left dash in {reading}")
        else:
            try:
                candidate_pronunciations = list(pronunciation_df[kanji])
            except KeyError:
                candidate_pronunciations = []

            candidate_pronunciations = list(set(candidate_pronunciations))

            candidate_pronunciations = [
                x for x in candidate_pronunciations if len(x) == len(reading)
            ]
            candidate_pronunciations = [
                x
                for x in candidate_pronunciations
                if all([x[i] == reading[i] for i in indices_of_not_dash])
            ]

            if len(candidate_pronunciations) == 1:
                reading = candidate_pronunciations[0]
            else:
                pass
                # logger.warning(f"Left dashes in {kanji} {reading}")

    return reading