Source code for text_quality.feature.scorer.dictionary

import logging
from abc import abstractmethod
from pathlib import Path
from typing import List
from spylls import hunspell
from ...settings import ENCODING
from ...settings import LINE_SEPARATOR
from .scorer import Scorer



[docs]
class Dictionary(Scorer):
    def __init__(self, dictionary) -> None:
        self._dictionary = dictionary

    @abstractmethod

[docs]
    def _lookup(self, token: str) -> bool:
        return NotImplemented



[docs]
    def score(self, tokens: List[str]) -> float:
        """
        `See Nautilus-OCR <https://github.com/natliblux/nautilusocr/blob/2d4d59c45466b5cc8c9897798bd8b205a7f0c02c/src/epr/features_epr.py#L129>`_
        """
        if not any(len(token) > 0 for token in tokens):
            # empty input
            return 0.0

        matched_count = 0
        total_count = 0

        for token in tokens:
            total_count += len(token)

            # TODO: lowercase token?
            matched_count += self._lookup(token) * len(token)

        return matched_count / total_count





[docs]
class TokenDictionary(Dictionary):
    def __init__(self, dictionary) -> None:
        super().__init__(set(dictionary))


[docs]
    def _lookup(self, token: str) -> bool:
        return token in self._dictionary



[docs]
    def to_file(self, filepath: Path, sort: bool = True, overwrite: bool = False):
        if filepath.exists() and not overwrite:
            raise FileExistsError(filepath)

        tokens = sorted(self._dictionary) if sort else self._dictionary
        logging.info("Writing %d tokens to file '%s'.", len(tokens), filepath)

        with open(filepath, "wt", encoding=ENCODING) as f:
            f.write(LINE_SEPARATOR.join(tokens))


    @classmethod

[docs]
    def from_file(cls, filepath: Path):
        logging.info("Reading token dictionary from file '%s'.", str(filepath))
        with open(filepath, "rt", encoding=ENCODING) as f:
            tokens = [line.strip() for line in f if not line.strip().startswith("#")]
        return cls(tokens)





[docs]
class HunspellDictionary(Dictionary):

[docs]
    def _lookup(self, token: str) -> bool:
        return len(token.strip()) > 0 and self._dictionary.lookup(token)


    @classmethod

[docs]
    def from_path(cls, path: Path, language: str) -> "HunspellDictionary":
        logging.info(
            "Reading Hunspell dictionary '%s' in directory '%s'", language, str(path)
        )
        return cls(hunspell.Dictionary.from_files(str(path / language)))