"""kwdlc.py
Data processing script for KWDLC files directly in the repository format
KWDLC repository: https://github.com/ku-nlp/KWDLC
"""
import warnings
from pathlib import Path

import pandas as pd
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")
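
# A .knp file in the KWDLC repository is a sequence of sentence blocks. Within
# a block, "#" lines carry metadata (including an "S-ID:<sentence id>" field),
# "*" and "+" lines mark phrase boundaries, "EOS" closes the sentence, and the
# remaining lines are morphemes whose first two space-separated fields are the
# surface form and its reading. A hypothetical, abbreviated block for
# illustration (real lines carry more trailing fields):
#
#   # S-ID:w201106-0000000000-1
#   * 1D
#   + 1D
#   博物館 はくぶつかん 博物館 名詞 ...
#   EOS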
def read_knp_file(filename: str):
    with open(filename, encoding="utf-8") as f:
        contents = f.readlines()

    ids = []
    sentences = []
    furiganas = []
    sentence = ""
    furigana = ""
    for row in contents:
        first_word = row.split(" ")[0]
        if first_word in ["*", "+"]:
            # phrase-boundary markers carry no surface text
            pass
        elif first_word == "#":
            # metadata line: pull the sentence id out of the S-ID field
            sentence_id = row.split(" ")[1].split("S-ID:")[1]
        elif first_word == "EOS\n":
            # end of sentence: keep the pair only if stripping the furigana
            # annotations recovers the plain sentence exactly
            sentence = utils.standardize_text(sentence)
            furigana = utils.standardize_text(furigana)
            if sentence == utils.remove_furigana(furigana):
                sentences.append(sentence)
                furiganas.append(furigana)
                ids.append(sentence_id)
            else:
                logger.warning(
                    f"Dropping mismatched line \n Sentence: {sentence} \n Furigana: {furigana}"
                )
            sentence = ""
            furigana = ""
        else:
            # morpheme line: first field is the surface form, second its reading
            words = row.split(" ")
            sentence += words[0]
            if words[0] == words[1]:
                # reading equals surface (e.g. kana-only tokens): no ruby needed
                furigana += words[0]
            else:
                furigana += ttlig.RubyToken.from_furi(words[0], words[1]).to_code()

    assert len(ids) == len(sentences)
    assert len(sentences) == len(furiganas)
    return ids, sentences, furiganas  # readings
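
# For illustration: on a morpheme line such as "博物館 はくぶつかん ...",
# ttlig.RubyToken.from_furi("博物館", "はくぶつかん").to_code() yields a
# bracketed ruby code (of the form {博物館/はくぶつかん} in speach's ttlig
# notation), so each returned furigana string interleaves plain text with
# such codes, while the matching sentence string keeps only the surface text.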
def kwdlc_data():
    """Extract, load and transform the kwdlc data"""

    # Extract sentences from the data files
    knp_files = list(Path(config.RAW_DATA_DIR, "kwdlc").glob("**/*.knp"))

    all_ids = []
    all_sentences = []
    all_furiganas = []
    for knp_file in knp_files:
        ids, sentences, furiganas = read_knp_file(knp_file)
        all_ids += ids
        all_sentences += sentences
        all_furiganas += furiganas

    # construct dataframe
    df = pd.DataFrame(
        list(zip(all_sentences, all_furiganas, all_ids)),
        columns=["sentence", "furigana", "sentenceid"],
    )

    # remove known errors
    error_ids = [
        "w201106-0000547376-1",
        "w201106-0001768070-1-01",
        "w201106-0000785999-1",
        "w201106-0001500842-1",
        "w201106-0000704257-1",
        "w201106-0002300346-3",
        "w201106-0001779669-3",
        "w201106-0000259203-1",
    ]
    df = df[~df["sentenceid"].isin(error_ids)]

    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)

    # Test
    assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "kwdlc.csv"), index=False)
    logger.info("✅ Saved kwdlc data!")
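
# Running this module directly assumes the KWDLC repository has been cloned
# (or its .knp files copied) into Path(config.RAW_DATA_DIR, "kwdlc"); the
# cleaned sentence/furigana pairs are written to
# Path(config.SENTENCE_DATA_DIR, "kwdlc.csv").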
if __name__ == "__main__":
    kwdlc_data()