# Sam Passaglia
# initial commit
# 9aba307
# raw
# history blame
# 1.08 kB
"""sudachi.py
Data processing script for sudachi dictionary
"""
import warnings
from pathlib import Path
import pandas as pd
from config import config
from config.config import logger
# Suppress every warning for the whole process. This runs at import time, so
# it also affects any other module that imports this one.
warnings.filterwarnings("ignore")
def sudachi_data():
    """Build the sudachi surface/reading table and write it to disk.

    Reads every ``*.csv`` file found in ``config.RAW_DATA_DIR / "sudachi"``
    (loaded without a header row, so columns are addressed by position),
    drops the rows that carry no usable reading, and writes the remaining
    ``surface`` and ``kana`` columns to
    ``config.READING_DATA_DIR / "sudachi.csv"`` without the index.

    Raises:
        FileNotFoundError: If the raw sudachi directory contains no CSV file.
    """
    sudachi_dir = Path(config.RAW_DATA_DIR, "sudachi")
    sudachi_files = list(sudachi_dir.glob("*.csv"))
    if not sudachi_files:
        # Fail with an explicit message instead of the opaque ``KeyError: 0``
        # that indexing an empty frame would produce further down.
        raise FileNotFoundError(f"No sudachi CSV files found in {sudachi_dir}")

    # Load each file once and concatenate a single time; concatenating inside
    # the loop re-copies all previously loaded rows for every new file.
    frames = []
    for file in sudachi_files:
        logger.info(file.name)
        frames.append(pd.read_csv(file, header=None))
    # ignore_index gives the combined frame unique row labels; the index is
    # not written to the output file, so the result is unaffected.
    df = pd.concat(frames, ignore_index=True)

    # NOTE(review): positions 0 / 11 / 5 are presumably the surface form,
    # the reading and the top-level part of speech of the Sudachi lexicon
    # layout -- confirm against the dictionary format documentation.
    df["surface"] = df[0].astype(str).str.strip()
    df["kana"] = df[11].astype(str).str.strip()
    df["type"] = df[5].astype(str).str.strip()

    keep = (
        (df["kana"] != "*")  # presumably "*" marks a missing reading
        & (df["surface"] != df["kana"])  # reading identical to surface adds nothing
        & (df["type"] != "補助記号")  # presumably the symbol/punctuation category
    )
    df = df.loc[keep, ["surface", "kana"]]

    df.to_csv(Path(config.READING_DATA_DIR, "sudachi.csv"), index=False)

    logger.info("✅ Processed sudachi data!")
# Run the sudachi processing step when this file is executed as a script.
if __name__ == "__main__":
    sudachi_data()