Spaces:
Build error
Build error
"""sudachi.py | |
Data processing script for sudachi dictionary | |
""" | |
import warnings | |
from pathlib import Path | |
import pandas as pd | |
from config import config | |
from config.config import logger | |
warnings.filterwarnings("ignore") | |
def sudachi_data(): | |
sudachi_file = list(Path(config.RAW_DATA_DIR, "sudachi").glob("*.csv")) | |
df = pd.DataFrame() | |
for file in sudachi_file: | |
logger.info(file.name) | |
# Load file | |
df = pd.concat( | |
[ | |
df, | |
pd.read_csv( | |
file, | |
header=None, | |
), | |
] | |
) | |
df["surface"] = df[0].astype(str).str.strip() | |
df["kana"] = df[11].astype(str).str.strip() | |
df["type"] = df[5].astype(str).str.strip() | |
df = df[df["kana"] != "*"] | |
df = df[df["surface"] != df["kana"]] | |
df = df[df["type"] != "補助記号"] | |
df = df[["surface", "kana"]] | |
df.to_csv(Path(config.READING_DATA_DIR, "sudachi.csv"), index=False) | |
logger.info("✅ Processed sudachi data!") | |
if __name__ == "__main__": | |
sudachi_data() | |