File size: 2,139 Bytes
9aba307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

# Lookup table built once at import time: a Series indexed by the "surface"
# column of all.csv whose values are the lists of "kana" entries recorded for
# that surface (presumably written form -> known readings; verify against the
# data file).
pronunciation_df = (
    pd.read_csv(Path(config.READING_DATA_DIR) / "all.csv")
    .groupby("surface")["kana"]
    .apply(list)
)


def repair_long_vowels(kana: str, kanji: str = None) -> str:
    """Replace long-vowel marks ("ー") in a kana reading with explicit vowels.

    A mark that follows a kana ending in the vowel u becomes "う", and one
    that follows a kana ending in the vowel i becomes "い". Any mark left after
    that pass is ambiguous. When ``kanji`` is given, the readings stored for it
    in ``pronunciation_df`` are searched for exactly one candidate that has the
    same length as the reading and agrees with it at every position that is
    not a mark; if such a candidate exists it is returned.

    Args:
        kana (str): reading that may contain long-vowel marks
        kanji (str): key used to look up candidate readings, optional

    Returns:
        str: the repaired reading; marks that could not be resolved are kept
    """

    long_vowel_mark = "ー"  # U+30FC, a single code point
    # Kana after which a long-vowel mark can only stand for "う" / "い".
    followed_by_u = "ぬつづむるくぐすずゆゅふぶぷ"
    followed_by_i = "しじみいきぎひびちぢぃ"

    # Resolve the unambiguous marks. The scan starts at 1 because a leading
    # mark has no preceding kana (index -1 would wrap around to the last
    # character). Each decision looks at the already-repaired previous
    # character, so replacements made earlier in the scan are taken into
    # account.
    chars = list(kana)
    for pos in range(1, len(chars)):
        if chars[pos] != long_vowel_mark:
            continue
        previous = chars[pos - 1]
        if previous in followed_by_u:
            chars[pos] = "う"
        elif previous in followed_by_i:
            chars[pos] = "い"
    reading = "".join(chars)

    fixed_positions = [
        pos for pos, char in enumerate(reading) if char != long_vowel_mark
    ]
    if len(fixed_positions) == len(reading):
        # Nothing ambiguous is left.
        return reading

    if not kanji:
        logger.info("Disambiguating this dash requires kanji")
        logger.info(f"Left dash in {reading}")
        return reading

    try:
        known_readings = pronunciation_df[kanji]
    except KeyError:
        # No readings recorded for this key: leave the marks in place.
        return reading

    # Keep the distinct candidates that line up with the reading everywhere
    # except at the unresolved marks. Non-string entries are ignored.
    matches = {
        candidate
        for candidate in known_readings
        if isinstance(candidate, str)
        and len(candidate) == len(reading)
        and all(candidate[pos] == reading[pos] for pos in fixed_positions)
    }

    # Only an unambiguous match is trusted; otherwise the marks stay.
    if len(matches) == 1:
        return matches.pop()
    return reading