Source code for text_quality.settings

"""Global settings."""

import os
from pathlib import Path
from typing import Optional



[docs]
MINIMUM_PAGE_LENGTH: int = 5
"""Minimum text length; any text shorter than this is treated as empty."""


[docs]
EMPTY_PAGE_OUTPUT: Optional[int] = 0
"""Value emitted for empty pages.

When set to ``None``, empty pages get no special treatment and go through the
standard pipeline instead.
"""


[docs]
SHORT_COLUMN_WIDTH: int = 5
"""Width threshold for broken pages.

A page is considered broken when every one of its lines (columns) is shorter
than this value.
"""


[docs]
ENCODING: str = "utf-8"
"""Text encoding applied to every text file processing operation."""


[docs]
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
"""Log level, read from the ``LOG_LEVEL`` environment variable (default ``"INFO"``)."""



[docs]
LINE_SEPARATOR: str = os.getenv("LINE_SEPARATOR", "\n")
"""Line separator string.

Read from the ``LINE_SEPARATOR`` environment variable; defaults to a single
newline character when the variable is not set.
"""



[docs]
Q_GRAM_LENGTH: int = int(os.getenv("Q_GRAM_LENGTH", "3"))
"""Q-gram length.

Read from the ``Q_GRAM_LENGTH`` environment variable (default ``3``); the value
must parse as an integer, otherwise importing this module raises ``ValueError``.
"""


[docs]
Q_GRAMS_GAMMA: int = int(os.getenv("Q_GRAMS_GAMMA", "1000"))
"""Integer ``gamma`` setting for q-grams.

Read from the ``Q_GRAMS_GAMMA`` environment variable (default ``1000``); the
value must parse as an integer, otherwise importing this module raises
``ValueError``.
"""



[docs]
SOURCE_DIR: Path = Path(__file__).parent
"""Directory that contains this settings module."""


[docs]
DATA_DIR: Path = SOURCE_DIR / "data"
"""The ``data`` directory located next to this module; must exist at import time."""



[docs]
DICTS_DIR: Path = DATA_DIR / "dicts"
"""The ``dicts`` subdirectory of :data:`DATA_DIR`; must exist at import time."""


[docs]
HUNSPELL_DIR: Path = DICTS_DIR / "hunspell"
"""The ``hunspell`` subdirectory of :data:`DICTS_DIR`; must exist at import time."""



[docs]
QGRAMS_DIR: Path = DATA_DIR / "qgrams"
"""The ``qgrams`` subdirectory of :data:`DATA_DIR`."""



[docs]
CLASSIFIER_DIR: Path = DATA_DIR / "classifier"
"""The ``classifier`` subdirectory of :data:`DATA_DIR`; must exist at import time."""


# Fail fast at import time when a required data directory is missing.
# QGRAMS_DIR is checked as well: QGRAMS_FILE (validated further down) lives
# inside it, so a missing directory is reported as such rather than as a
# missing file.
for directory in (
    DATA_DIR,
    DICTS_DIR,
    HUNSPELL_DIR,
    QGRAMS_DIR,
    CLASSIFIER_DIR,
):
    if not directory.is_dir():
        raise NotADirectoryError(directory)


### INITIALIZE: default language and the resource files that must exist

[docs]
DEFAULT_LANGUAGE: str = "nl"
"""Default language code (``"nl"``)."""



[docs]
HUNSPELL_LANGUAGE: str = DEFAULT_LANGUAGE
"""Hunspell language setting; takes the same value as :data:`DEFAULT_LANGUAGE`."""


[docs]
TOKEN_DICT_FILE: Path = DICTS_DIR / "nl_voc.txt"
"""Path of the token dictionary file ``nl_voc.txt`` inside :data:`DICTS_DIR`.

The file must exist; this is verified when the module is imported.
"""


[docs]
QGRAMS_FILE: Path = QGRAMS_DIR / "nl_voc.txt"
"""Path of the q-grams file ``nl_voc.txt`` inside :data:`QGRAMS_DIR`.

The file must exist; this is verified when the module is imported.
"""


[docs]
PIPELINE_FILE: Path = CLASSIFIER_DIR / "pipeline_nn.joblib"
"""Path of the pipeline file ``pipeline_nn.joblib`` inside :data:`CLASSIFIER_DIR`.

The file must exist; this is verified when the module is imported.
"""


# Refuse to import the settings when any required resource file is absent.
for file in (TOKEN_DICT_FILE, QGRAMS_FILE, PIPELINE_FILE):
    if file.is_file():
        continue
    raise FileNotFoundError(file)