File size: 2,941 Bytes
9aba307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# config.py

import json
import logging.config
import sys
from pathlib import Path

from rich.logging import RichHandler

# Project root (two directory levels above this file) and the config folder.
BASE_DIR = Path(__file__).parent.parent.absolute()
CONFIG_DIR = BASE_DIR / "config"

# Data folders: raw input, sentence data with its train/val/test subfolders,
# and reading data.
RAW_DATA_DIR = BASE_DIR / "raw_data"
SENTENCE_DATA_DIR = BASE_DIR / "sentence_data"
TRAIN_DATA_DIR = SENTENCE_DATA_DIR / "train"
VAL_DATA_DIR = SENTENCE_DATA_DIR / "val"
TEST_DATA_DIR = SENTENCE_DATA_DIR / "test"
READING_DATA_DIR = BASE_DIR / "reading_data"

# Log output folder.
LOGS_DIR = BASE_DIR / "logs"

# Model storage folder and the run registry beneath it.
STORES_DIR = BASE_DIR / "stores"
RUN_REGISTRY = STORES_DIR / "runs"

# Create every folder above except CONFIG_DIR at import time. Missing parents
# are created and already-existing folders are left alone.
for _directory in (
    RAW_DATA_DIR,
    SENTENCE_DATA_DIR,
    TRAIN_DATA_DIR,
    VAL_DATA_DIR,
    TEST_DATA_DIR,
    READING_DATA_DIR,
    LOGS_DIR,
    STORES_DIR,
    RUN_REGISTRY,
):
    _directory.mkdir(parents=True, exist_ok=True)
del _directory  # keep the loop variable out of the module namespace

# Special tokens reserved
# Stand-in for the usual space character: spaces in the text are replaced with
# this token before the text is sent to mecab, because mecab uses the usual
# space to separate words.
ASCII_SPACE_TOKEN = "\U0000FFFF"

# Seed
SEED = 1271297

# Training parameters: fractions of the data assigned to each split.
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15
# Compare with a tolerance rather than `== 1`: binary floating point cannot
# represent most decimal fractions exactly, so a valid split such as
# 0.7 / 0.2 / 0.1 sums to 0.9999999999999999 and would fail an exact check.
assert abs(TRAIN_SIZE + VAL_SIZE + TEST_SIZE - 1.0) < 1e-9, (
    "TRAIN_SIZE, VAL_SIZE and TEST_SIZE must sum to 1"
)

# Heteronym list, loaded from config/heteronyms.json at import time.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open(Path(CONFIG_DIR, "heteronyms.json"), encoding="utf-8") as fp:
    HETERONYMS = json.load(fp)

# Logger
# The dictConfig schema is assembled one section at a time.
logging_config = {"version": 1, "disable_existing_loggers": False}

# "minimal" prints only the message; "detailed" prefixes level, timestamp and
# the logger name / file / function / line of the call site.
logging_config["formatters"] = {
    "minimal": {"format": "%(message)s"},
    "detailed": {
        "format": "%(levelname)s %(asctime)s [%(name)s:%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n"
    },
}

# One stdout stream handler plus two rotating files under LOGS_DIR:
# info.log takes INFO and above, error.log takes ERROR and above.
logging_config["handlers"] = {
    "console": {
        "class": "logging.StreamHandler",
        "stream": sys.stdout,
        "formatter": "minimal",
        "level": logging.DEBUG,
    },
    "info": {
        "class": "logging.handlers.RotatingFileHandler",
        "filename": LOGS_DIR / "info.log",
        "maxBytes": 10 * 1024 * 1024,  # 10 MiB (10485760 bytes) per file
        "backupCount": 10,
        "formatter": "detailed",
        "level": logging.INFO,
    },
    "error": {
        "class": "logging.handlers.RotatingFileHandler",
        "filename": LOGS_DIR / "error.log",
        "maxBytes": 10 * 1024 * 1024,  # 10 MiB (10485760 bytes) per file
        "backupCount": 10,
        "formatter": "detailed",
        "level": logging.ERROR,
    },
}

# All three handlers are attached to the root logger, which filters at INFO.
logging_config["root"] = {
    "handlers": ["console", "info", "error"],
    "level": logging.INFO,
    "propagate": True,
}

logging.config.dictConfig(logging_config)
logger = logging.getLogger()
# Swap the first root handler for a RichHandler with markup enabled.
# NOTE(review): index 0 is presumably the "console" handler, given the order
# listed under "root" above — confirm if that list is ever reordered.
logger.handlers[0] = RichHandler(markup=True)