Spaces:
Build error
Build error
File size: 1,598 Bytes
9aba307 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
from pathlib import Path
import jaconv
import pandas as pd
from tqdm import tqdm
from config import config
from config.config import logger
from yomikata import utils
def pronunciation_data():
    """Merge the per-source pronunciation CSVs into a single ``all.csv``.

    Reads every ``*.csv`` file in ``config.READING_DATA_DIR`` except the
    generated ``all.csv`` and ``ambiguous.csv``, normalizes the ``surface``
    and ``kana`` columns, filters out unusable rows, and writes the
    de-duplicated ``surface``/``kana`` pairs to ``all.csv`` in that same
    directory.

    If no input files are found, a warning is logged and nothing is written.
    """
    generated_names = {"all.csv", "ambiguous.csv"}
    # Sorted so the output row order does not depend on filesystem listing
    # order.
    source_files = sorted(
        path
        for path in Path(config.READING_DATA_DIR).glob("*.csv")
        if path.name not in generated_names
    )
    if not source_files:
        logger.warning(
            "No pronunciation CSV files found in %s", config.READING_DATA_DIR
        )
        return

    # Concatenate once instead of growing the frame file by file.
    df = pd.concat(
        [pd.read_csv(path) for path in source_files], ignore_index=True
    )

    # Drop rows with a missing surface or reading first; astype(str) would
    # otherwise turn them into the literal string "nan".
    df = df.dropna(subset=["surface", "kana"])

    df["surface"] = df["surface"].astype(str).str.strip()
    df["kana"] = df["kana"].astype(str).str.strip()

    tqdm.pandas()  # registers the progress_apply methods used below

    df["kana"] = df["kana"].progress_apply(utils.standardize_text)
    df["surface"] = df["surface"].progress_apply(utils.standardize_text)
    # Convert katakana in the readings to hiragana.
    df["kana"] = df["kana"].progress_apply(jaconv.kata2hira)

    # Keep only rows whose reading is non-empty, differs from the surface
    # form, and whose surface passes utils.has_kanji.
    df = df[df["surface"] != df["kana"]]
    df = df[df["kana"] != ""]
    # astype(bool) keeps the mask boolean even when the frame is empty.
    df = df[df["surface"].progress_apply(utils.has_kanji).astype(bool)]
    # NOTE(review): the pattern only matches one of the listed symbols when it
    # is immediately followed by a literal "." - confirm that is intended.
    df = df.loc[~df["surface"].str.contains(r"[〜〜()\)\(\*]\.")]

    df = df[["surface", "kana"]]
    df = df.drop_duplicates()

    df.to_csv(Path(config.READING_DATA_DIR, "all.csv"), index=False)

    logger.info("✅ Merged all the pronunciation data!")

    # Disabled: export of surfaces that have more than one reading.
    # merged_df = (
    #     df.groupby("surface")["kana"]
    #     .apply(list)
    #     .reset_index(name="pronunciations")
    # )
    # ambiguous_df = merged_df[merged_df["pronunciations"].apply(len) > 1]
    # ambiguous_df.to_csv(Path(config.READING_DATA_DIR, "ambiguous.csv"), index=False)
# Script entry point: rebuild the merged pronunciation file when this module
# is executed directly rather than imported.
if __name__ == "__main__":
    pronunciation_data()
|