Spaces:
Build error
Build error
"""unidic.py | |
Data processing script for unidic dictionary | |
""" | |
import warnings | |
from pathlib import Path | |
import pandas as pd | |
from config import config | |
from config.config import logger | |
warnings.filterwarnings("ignore") | |
def unidic_data():
    """Extract surface/kana reading pairs from the unidic CSV and save them.

    Reads the first ``*.csv`` file (in sorted name order) found under
    ``config.RAW_DATA_DIR / "unidic"``, keeps only the ``surface`` and
    ``kana`` columns, drops rows without a usable reading, and writes the
    result to ``config.READING_DATA_DIR / "unidic.csv"``.

    Rows are dropped when the kana field is ``"*"`` or empty, when the
    surface is empty, or when surface and kana are identical.

    Raises:
        FileNotFoundError: If the unidic raw-data directory contains no CSV.
    """
    column_names = (
        "surface id1 id2 id3 pos1 pos2 pos3 pos4 cType "
        "cForm lForm lemma orth orthBase pron pronBase goshu iType iForm fType "
        "fForm iConType fConType type kana kanaBase form formBase aType aConType "
        "aModType lid lemma_id"
    ).split(" ")

    # Locate the dictionary file. glob() yields entries in arbitrary order,
    # so sort to make the choice deterministic when several CSVs are present.
    unidic_dir = Path(config.RAW_DATA_DIR, "unidic")
    csv_files = sorted(unidic_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No unidic CSV file found in {unidic_dir}")
    unidic_file = csv_files[0]

    # Load every field as text. By default pandas parses tokens such as
    # "NA", "null" or "nan" as missing values and infers numeric dtypes,
    # which would corrupt dictionary entries that literally consist of
    # those strings or of digits.
    df = pd.read_csv(
        unidic_file,
        header=None,
        names=column_names,
        dtype=str,
        keep_default_na=False,
    )

    # Short rows can still produce missing cells; treat them as empty text
    # so they are filtered out below instead of being written as "nan".
    df["surface"] = df["surface"].fillna("").str.strip()
    df["kana"] = df["kana"].fillna("").str.strip()

    has_reading = ~df["kana"].isin(["*", ""])
    has_surface = df["surface"] != ""
    # Identical surface and kana are skipped, as in the original filter
    # (presumably because such rows add no reading information).
    differs = df["surface"] != df["kana"]
    df = df.loc[has_reading & has_surface & differs, ["surface", "kana"]]

    # Make sure the destination directory exists before writing.
    output_file = Path(config.READING_DATA_DIR, "unidic.csv")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_file, index=False)

    logger.info("✅ Processed unidic data!")
# Script entry point: run the unidic processing step when executed directly.
if __name__ == "__main__":
    unidic_data()