Source code for text_quality.feature.scorer.dictionary

import logging
from abc import abstractmethod
from pathlib import Path
from typing import List
from spylls import hunspell
from ...settings import ENCODING
from ...settings import LINE_SEPARATOR
from .scorer import Scorer


class Dictionary(Scorer):
    """
    Base class for scorers that rate tokens by looking them up in a dictionary.

    Subclasses implement :meth:`_lookup` to decide whether a single token is
    known to the wrapped dictionary object.
    """

    def __init__(self, dictionary) -> None:
        """
        :param dictionary: the object consulted by :meth:`_lookup`; stored as-is.
        """
        self._dictionary = dictionary

    @abstractmethod
    def _lookup(self, token: str) -> bool:
        """
        Tell whether ``token`` is known to the dictionary.

        :param token: the token to look up.
        :return: ``True`` if the token is in the dictionary, ``False`` otherwise.
        :raises NotImplementedError: always, in this base implementation.
        """
        # ``NotImplemented`` is the sentinel for binary special methods: it is
        # truthy and must not be used as a boolean, so returning it here would
        # silently corrupt the score. Fail loudly instead.
        raise NotImplementedError

    def score(self, tokens: List[str]) -> float:
        """
        Compute the fraction of characters that belong to known tokens.

        Every token is weighted by its length, so long tokens influence the
        score more than short ones.

        `See Nautilus-OCR <https://github.com/natliblux/nautilusocr/blob/2d4d59c45466b5cc8c9897798bd8b205a7f0c02c/src/epr/features_epr.py#L129>`_

        :param tokens: the tokens to score.
        :return: the number of characters in tokens found by :meth:`_lookup`
            divided by the total number of characters; ``0.0`` when the
            tokens contain no characters at all.
        """
        total_count = sum(len(token) for token in tokens)
        if total_count == 0:
            # empty input (no tokens, or only empty tokens): avoid dividing by zero
            return 0.0

        # TODO: lowercase token?
        matched_count = sum(len(token) for token in tokens if self._lookup(token))
        return matched_count / total_count
class TokenDictionary(Dictionary):
    """Dictionary scorer backed by an in-memory set of tokens."""

    def __init__(self, dictionary) -> None:
        """
        :param dictionary: an iterable of tokens; it is converted to a ``set``,
            so duplicates collapse and membership tests are cheap.
        """
        super().__init__(set(dictionary))

    def _lookup(self, token: str) -> bool:
        """
        :param token: the token to look up.
        :return: ``True`` if ``token`` is one of the stored tokens (exact match).
        """
        return token in self._dictionary

    def to_file(
        self, filepath: Path, sort: bool = True, overwrite: bool = False
    ) -> None:
        """
        Write the tokens to a text file, joined by ``LINE_SEPARATOR``.

        :param filepath: the file to write.
        :param sort: write the tokens in sorted order when ``True``; otherwise
            in the set's iteration order.
        :param overwrite: allow replacing an existing file.
        :raises FileExistsError: if ``filepath`` exists and ``overwrite`` is ``False``.
        """
        if filepath.exists() and not overwrite:
            raise FileExistsError(filepath)

        tokens = sorted(self._dictionary) if sort else self._dictionary
        logging.info("Writing %d tokens to file '%s'.", len(tokens), filepath)
        with open(filepath, "wt", encoding=ENCODING) as f:
            f.write(LINE_SEPARATOR.join(tokens))

    @classmethod
    def from_file(cls, filepath: Path):
        """
        Build a token dictionary from a text file holding one token per line.

        Leading and trailing whitespace is stripped from every line. Lines
        starting with ``#`` are treated as comments, and blank lines are
        skipped so that they do not end up as an empty-string token.

        :param filepath: the file to read.
        :return: a new instance of ``cls`` containing the tokens read.
        """
        logging.info("Reading token dictionary from file '%s'.", str(filepath))
        with open(filepath, "rt", encoding=ENCODING) as f:
            stripped_lines = (line.strip() for line in f)
            tokens = [
                token
                for token in stripped_lines
                if token and not token.startswith("#")
            ]
        return cls(tokens)
class HunspellDictionary(Dictionary):
    """Dictionary scorer that delegates lookups to a ``spylls`` Hunspell dictionary."""

    def _lookup(self, token: str) -> bool:
        """
        :param token: the token to look up.
        :return: ``False`` for a token that is empty or whitespace-only;
            otherwise the result of the wrapped dictionary's ``lookup``.
        """
        if not token.strip():
            # nothing but whitespace: never consult the Hunspell dictionary
            return False
        return self._dictionary.lookup(token)

    @classmethod
    def from_path(cls, path: Path, language: str) -> "HunspellDictionary":
        """
        Load the Hunspell dictionary named ``language`` from directory ``path``.

        :param path: the directory that holds the dictionary files.
        :param language: the file base name, appended to ``path`` and handed
            to ``hunspell.Dictionary.from_files``.
        :return: a new instance wrapping the loaded dictionary.
        """
        logging.info(
            "Reading Hunspell dictionary '%s' in directory '%s'", language, str(path)
        )
        base_name = str(path / language)
        loaded = hunspell.Dictionary.from_files(base_name)
        return cls(loaded)