Sam Passaglia
initial commit
9aba307
raw
history blame
1.16 kB
"""unidic.py
Data processing script for the UniDic dictionary.

Reads the raw UniDic CSV and writes its surface/kana pairs to the
reading-data directory.
"""
import warnings
from pathlib import Path
import pandas as pd
from config import config
from config.config import logger
# Suppress every warning for the whole process: no category or module filter
# is passed, so this is not limited to this script's own code.
# NOTE(review): presumably intended to hide pandas warnings emitted while
# loading the large CSV — confirm, since it also hides unrelated warnings.
warnings.filterwarnings("ignore")
def unidic_data():
    """Extract, load and transform the unidic data.

    Reads the first ``*.csv`` file (in sorted name order) found under
    ``config.RAW_DATA_DIR / "unidic"``, keeps only the ``surface`` and
    ``kana`` columns, and writes the cleaned pairs to
    ``config.READING_DATA_DIR / "unidic.csv"``.

    A row is dropped when its kana is the placeholder ``"*"``, when its
    surface or kana is empty, or when the surface is identical to the kana.

    Raises:
        FileNotFoundError: If the raw unidic directory contains no CSV file.
    """
    column_names = (
        "surface id1 id2 id3 pos1 pos2 pos3 pos4 cType "
        "cForm lForm lemma orth orthBase pron pronBase goshu iType iForm fType "
        "fForm iConType fConType type kana kanaBase form formBase aType aConType "
        "aModType lid lemma_id"
    ).split(" ")

    # Extract: glob order is filesystem-dependent, so sort the matches to make
    # the chosen file deterministic, and fail with a clear message when there
    # is nothing to process instead of an opaque IndexError.
    unidic_dir = Path(config.RAW_DATA_DIR, "unidic")
    csv_files = sorted(unidic_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No unidic CSV file found in {unidic_dir}")
    unidic_file = csv_files[0]

    # Load: read every field as text and disable NA detection. Otherwise
    # dictionary entries such as "NA", "null" or "nan" are parsed as missing
    # values and would later be written out as the literal string "nan".
    df = pd.read_csv(
        unidic_file,
        header=None,
        names=column_names,
        dtype=str,
        keep_default_na=False,
    )

    # Transform: only the surface form and its kana reading are needed.
    df = df[["surface", "kana"]]
    for column in ("surface", "kana"):
        # fillna guards against rows shorter than the expected column count.
        df[column] = df[column].fillna("").astype(str).str.strip()

    has_reading = (df["kana"] != "*") & (df["kana"] != "")
    has_surface = df["surface"] != ""
    # A surface that is already its own reading carries no information.
    is_informative = df["surface"] != df["kana"]
    df = df[has_reading & has_surface & is_informative]

    df.to_csv(Path(config.READING_DATA_DIR, "unidic.csv"), index=False)
    logger.info("✅ Processed unidic data!")
# Script entry point: run the unidic processing step only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    unidic_data()