Source code for text_quality.feature.scorer.q_gram

import logging
from functools import lru_cache
from pathlib import Path
from typing import List
from typing import Optional
from ...settings import ENCODING
from ...settings import LINE_SEPARATOR
from ...settings import Q_GRAM_LENGTH
from ...settings import Q_GRAMS_GAMMA
from .scorer import Scorer


class QGram(Scorer):
    """
    Scorer that rates tokens by the rank of their character q-grams.

    The q-gram list given to the constructor is treated as ranked: the index
    of a q-gram in that list is its rank, and q-grams closer to the start of
    the list contribute a higher score.
    """

    def __init__(self, qgrams: List[str]) -> None:
        """
        :param qgrams: Ranked list of known q-grams; index ``0`` is the best
            rank. The list is kept by reference and written out by
            :meth:`to_file`.
        """
        self._lang_qgrams = qgrams
        self._qgram_set = set(qgrams)
        # q-gram -> index of its FIRST occurrence in the list. This mirrors
        # ``list.index`` while giving O(1) lookups, so no per-method caching
        # is required (an ``lru_cache`` on a method would also pin every
        # instance in memory for the lifetime of the cache).
        self._qgram_ranks = {}
        for rank, qgram in enumerate(qgrams):
            self._qgram_ranks.setdefault(qgram, rank)

    def get_rank(self, qgram: str) -> Optional[int]:
        """
        Return the zero-based rank of ``qgram``.

        :param qgram: The q-gram to look up.
        :return: Index of the first occurrence of ``qgram`` in the q-gram
            list, or ``None`` if the q-gram is unknown.
        """
        return self._qgram_ranks.get(qgram)

    def _get_ngram_score(self, ngram: str) -> float:
        """
        Return the score of a single known q-gram.

        The score is ``1 - rank / n`` where ``n`` is the length of the q-gram
        list, i.e. ``1.0`` for the first q-gram, decreasing linearly.

        :param ngram: A q-gram that must be present in the q-gram list.
        :raises AssertionError: If ``ngram`` is not in the q-gram list.
        """
        rank = self._qgram_ranks.get(ngram)
        if rank is None:
            raise AssertionError()
        return 1 - (1 / len(self._lang_qgrams) * rank)

    def _get_ngram_scores(self, ngrams: List[str]) -> float:
        """
        Return the average score over ``ngrams``.

        Unknown q-grams contribute ``0`` but still count in the denominator.

        `See Nautilus-OCR <https://github.com/natliblux/nautilusocr/blob/2d4d59c45466b5cc8c9897798bd8b205a7f0c02c/src/epr/features_epr.py#L51>`_

        :param ngrams: The q-grams to score.
        :return: Sum of the scores of the known q-grams divided by the total
            number of q-grams, or ``0.0`` when ``ngrams`` is empty.
        """
        if not ngrams:
            return 0.0
        total = sum(
            self._get_ngram_score(ngram)
            for ngram in ngrams
            if ngram in self._qgram_set
        )
        return total / len(ngrams)

    def score(self, tokens: List[str]) -> float:
        """
        Score ``tokens`` by the average score of their character q-grams.

        :param tokens: The tokens to score.
        :return: Average q-gram score; ``0.0`` if the tokens yield no q-grams.
        """
        return self._get_ngram_scores(QGram._get_qgrams(tokens))

    def to_file(self, filepath: Path) -> None:
        """
        Write the q-gram list to ``filepath``, separated by ``LINE_SEPARATOR``.

        :param filepath: Destination path; it must not exist yet.
        :raises FileExistsError: If ``filepath`` already exists.
        """
        # Exclusive-creation mode fails atomically when the path exists,
        # avoiding the window between a separate existence check and the open.
        with open(filepath, "xt", encoding=ENCODING) as f:
            f.write(LINE_SEPARATOR.join(self._lang_qgrams))

    @staticmethod
    def _get_qgrams(tokens: List[str]) -> List[str]:
        """
        Extract the lower-cased character q-grams from ``tokens``.

        Every non-alphabetic character acts as a separator; each remaining
        alphabetic run yields all of its substrings of length
        ``Q_GRAM_LENGTH``. Runs shorter than ``Q_GRAM_LENGTH`` yield nothing.

        `See Nautilus-OCR <https://github.com/natliblux/nautilusocr/blob/2d4d59c45466b5cc8c9897798bd8b205a7f0c02c/src/epr/features_epr.py#L51>`_

        :param tokens: The tokens to split into q-grams.
        :return: The q-grams in order of appearance, duplicates included.
        """
        q_grams = []
        for token in tokens:
            # Blank out everything that is not a letter, then split on the
            # blanks; ``split()`` without arguments drops the empty pieces.
            letters_only = "".join(
                char if char.isalpha() else " " for char in token
            )
            for run in letters_only.split():
                # Slice first, lower-case second: lower-casing can change the
                # length of some Unicode strings.
                for start in range(len(run) - Q_GRAM_LENGTH + 1):
                    q_grams.append(run[start : start + Q_GRAM_LENGTH].lower())
        return q_grams

    @classmethod
    def from_file(cls, filepath: Path, gamma: int = Q_GRAMS_GAMMA) -> "QGram":
        """
        Build an instance from a file containing one q-gram per line.

        Lines are stripped of surrounding whitespace and kept in file order,
        so the line number determines the rank.

        :param filepath: File to read the q-grams from.
        :param gamma: Maximum number of q-grams to read; a falsy value
            (e.g. ``0``) reads the whole file.
        :return: A new instance holding the q-grams that were read.
        """
        logging.info(
            "Reading character q-grams from file '%s', with gamma=%d.",
            str(filepath),
            gamma,
        )
        q_grams = []
        with open(filepath, "rt", encoding=ENCODING) as f:
            for line in f:
                q_grams.append(line.strip())
                if gamma and len(q_grams) >= gamma:
                    logging.info(
                        "Stopping reading q-grams list, %d q-grams read.",
                        len(q_grams),
                    )
                    break
        return cls(q_grams)