"""Global settings."""
import os
from pathlib import Path
from typing import Optional
# Page-level thresholds.
MINIMUM_PAGE_LENGTH: int = 5
"""Shorter texts are considered as empty."""

EMPTY_PAGE_OUTPUT: Optional[int] = 0
"""Output value for empty pages.
If None, empty pages are handled through the standard pipeline."""

SHORT_COLUMN_WIDTH: int = 5
"""If all lines (columns) in a page are shorter than this it is considered broken."""

# NOTE(review): the description below was a free-standing string with no
# constant attached to it; presumably an encoding setting was meant to be
# defined here -- confirm against the project history.
# "Encoding to be used throughout all text file processing operations."

# Settings that can be overridden through environment variables of the same
# name; the second argument is the value used when the variable is unset.
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

LINE_SEPARATOR = os.getenv("LINE_SEPARATOR", "\n")

# The two q-gram settings are parsed with int(), so a non-integer value in the
# environment raises ValueError when this module is imported.
Q_GRAM_LENGTH: int = int(os.environ.get("Q_GRAM_LENGTH", "3"))

Q_GRAMS_GAMMA: int = int(os.environ.get("Q_GRAMS_GAMMA", "1000"))
# Directory layout; every path is derived from the directory containing this file.
SOURCE_DIR = Path(__file__).parent

DATA_DIR = SOURCE_DIR / "data"

DICTS_DIR = DATA_DIR / "dicts"

HUNSPELL_DIR = DICTS_DIR / "hunspell"

QGRAMS_DIR = DATA_DIR / "qgrams"

CLASSIFIER_DIR = DATA_DIR / "classifier"

# Fail at import time if any expected data directory is missing, so the
# problem is reported with the offending path rather than on first use.
# QGRAMS_DIR is checked alongside the others for consistency.
for directory in (DATA_DIR, DICTS_DIR, HUNSPELL_DIR, QGRAMS_DIR, CLASSIFIER_DIR):
    if not directory.is_dir():
        raise NotADirectoryError(directory)
### INITIALIZE

# Language code used by default; the Hunspell language follows it.
DEFAULT_LANGUAGE = "nl"

HUNSPELL_LANGUAGE = DEFAULT_LANGUAGE

# Data files located inside the directories defined earlier in this module.
TOKEN_DICT_FILE: Path = DICTS_DIR / "nl_voc.txt"

QGRAMS_FILE: Path = QGRAMS_DIR / "nl_voc.txt"

PIPELINE_FILE: Path = CLASSIFIER_DIR / "pipeline_nn.joblib"

# Fail at import time if any of these data files is missing; the exception
# carries the path that could not be found.
for file in (TOKEN_DICT_FILE, QGRAMS_FILE, PIPELINE_FILE):
    if not file.is_file():
        raise FileNotFoundError(file)