Spaces:
Build error
Build error
File size: 1,158 Bytes
9aba307 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
"""unidic.py
Data processing script for unidic dictionary
"""
import warnings
from pathlib import Path
import pandas as pd
from config import config
from config.config import logger
warnings.filterwarnings("ignore")
def unidic_data():
    """Extract, load and transform the unidic data.

    Reads the first ``*.csv`` file (in sorted order) found under
    ``config.RAW_DATA_DIR / "unidic"``, keeps only the ``surface`` and
    ``kana`` columns, drops rows that have no usable reading (``"*"`` or
    empty) or whose reading is identical to the surface form, and writes the
    result to ``config.READING_DATA_DIR / "unidic.csv"``.

    Raises:
        FileNotFoundError: If the unidic raw data directory contains no CSV
            file.
    """
    unidic_dir = Path(config.RAW_DATA_DIR, "unidic")

    # Sort so the selected file is deterministic; glob order depends on the
    # filesystem.
    csv_files = sorted(unidic_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No unidic CSV file found in {unidic_dir}")
    unidic_file = csv_files[0]

    column_names = (
        "surface id1 id2 id3 pos1 pos2 pos3 pos4 cType "
        "cForm lForm lemma orth orthBase pron pronBase goshu iType iForm fType "
        "fForm iConType fConType type kana kanaBase form formBase aType aConType "
        "aModType lid lemma_id"
    ).split(" ")

    # Read every field as a plain string and disable NA detection: otherwise
    # dictionary entries such as "NA", "null" or "nan" are parsed as missing
    # values (and later stringified to "nan"), and numeric-looking surface
    # forms can be coerced to numbers.
    df = pd.read_csv(
        unidic_file,
        header=None,
        names=column_names,
        dtype=str,
        keep_default_na=False,
    )

    df["surface"] = df["surface"].str.strip()
    df["kana"] = df["kana"].str.strip()

    # "*" is the dictionary's placeholder for "no reading"; an empty field is
    # equally unusable as a reading.
    has_reading = ~df["kana"].isin(["*", ""])
    # A reading identical to the surface form carries no information.
    reading_differs = df["surface"] != df["kana"]
    df = df.loc[has_reading & reading_differs, ["surface", "kana"]]

    df.to_csv(Path(config.READING_DATA_DIR, "unidic.csv"), index=False)
    logger.info("✅ Processed unidic data!")
# Script entry point: run the unidic processing step when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    unidic_data()
|