File size: 3,141 Bytes
9aba307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""kwdlc.py
Data processing script for KWDLC files directly in the repository format
KWDLC repository: https://github.com/ku-nlp/KWDLC
"""

import warnings
from pathlib import Path

import pandas as pd
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")


def read_knp_file(filename: str):
    """Parse one KNP-format file and return aligned sentence data.

    Walks the file line by line, accumulating the surface text and the
    furigana (ruby) markup for each sentence, and flushes the accumulators
    when the ``EOS`` terminator line is reached.  Sentences whose furigana
    markup does not round-trip back to the surface text are dropped with a
    warning rather than kept inconsistently.

    Args:
        filename: Path to a ``.knp`` file from the KWDLC repository.

    Returns:
        A tuple ``(ids, sentences, furiganas)`` of three equal-length lists:
        sentence ids, standardized surface sentences, and standardized
        furigana strings with ruby markup.
    """
    # KWDLC files are UTF-8; be explicit so the platform default encoding
    # cannot corrupt the Japanese text.
    with open(filename, encoding="utf-8") as f:
        contents = f.readlines()

    ids = []
    sentences = []
    furiganas = []
    sentence = ""
    furigana = ""
    # Set by the "#" header line that precedes each sentence; initialized so
    # a malformed file (EOS before any header) cannot raise NameError.
    sentence_id = None
    for row in contents:
        first_word = row.split(" ")[0]
        if first_word in ["*", "+"]:
            # Phrase/chunk annotation lines carry no surface text; skip.
            pass
        elif first_word == "#":
            # Header line, e.g. "# S-ID:w201106-... ..." -> extract the id.
            sentence_id = row.split(" ")[1].split("S-ID:")[1]
        elif first_word == "EOS\n":
            # End of sentence: standardize both strings and keep the pair
            # only if the furigana markup reproduces the surface sentence.
            sentence = utils.standardize_text(sentence)
            furigana = utils.standardize_text(furigana)
            if sentence == utils.remove_furigana(furigana):
                sentences.append(sentence)
                furiganas.append(furigana)
                ids.append(sentence_id)
            else:
                logger.warning(
                    f"Dropping mismatched line \n Sentence: {sentence} \n  Furigana: {furigana}"
                )
            sentence = ""
            furigana = ""
        else:
            # Morpheme line: "<surface> <reading> ...".
            words = row.split(" ")
            sentence += words[0]
            if words[0] == words[1]:
                # Reading equals surface form: no ruby markup needed.
                furigana += words[0]
            else:
                furigana += ttlig.RubyToken.from_furi(words[0], words[1]).to_code()

    # Sanity: the three parallel lists must stay in lockstep.
    assert len(ids) == len(sentences)
    assert len(sentences) == len(furiganas)
    return ids, sentences, furiganas


def kwdlc_data():
    """Extract, load and transform the kwdlc data.

    Reads every ``.knp`` file under ``RAW_DATA_DIR/kwdlc``, builds a
    sentence/furigana dataframe, removes known-bad sentence ids and
    duplicates, validates that every furigana string reproduces its surface
    sentence, and writes the result to ``SENTENCE_DATA_DIR/kwdlc.csv``.

    Raises:
        ValueError: If any furigana row fails to round-trip back to its
            surface sentence after standardization.
    """
    # Extract sentences from the data files
    knp_files = list(Path(config.RAW_DATA_DIR, "kwdlc").glob("**/*.knp"))

    all_ids = []
    all_sentences = []
    all_furiganas = []
    for knp_file in knp_files:
        ids, sentences, furiganas = read_knp_file(knp_file)
        all_ids += ids
        all_sentences += sentences
        all_furiganas += furiganas

    # construct dataframe
    df = pd.DataFrame(
        list(zip(all_sentences, all_furiganas, all_ids)),
        columns=["sentence", "furigana", "sentenceid"],
    )

    # Sentence ids known to carry annotation errors in the upstream data.
    error_ids = [
        "w201106-0000547376-1",
        "w201106-0001768070-1-01",
        "w201106-0000785999-1",
        "w201106-0001500842-1",
        "w201106-0000704257-1",
        "w201106-0002300346-3",
        "w201106-0001779669-3",
        "w201106-0000259203-1",
    ]

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)

    # Validate the whole frame. Use an explicit raise rather than `assert`
    # so the check is not silently stripped under `python -O`.
    if not (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all():
        raise ValueError("kwdlc data: furigana does not round-trip to sentence")

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "kwdlc.csv"), index=False)

    logger.info("✅ Saved kwdlc data!")


# Script entry point: run the full KWDLC extract/transform/save pipeline.
if __name__ == "__main__":
    kwdlc_data()