Source code for text_quality.feature.tokenizer

from abc import ABC
from abc import abstractmethod
from typing import List


class Tokenizer(ABC):
    """Abstract interface for objects that split a text into tokens."""

    @abstractmethod
    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into a list of tokens.

        :param text: The text to tokenize.
        :return: The tokens found in ``text``, in order of appearance.
        :raises NotImplementedError: Always in this base implementation;
            concrete subclasses must override the method.
        """
        # ``NotImplemented`` is the sentinel for binary-operator fallbacks and
        # is not a ``List[str]``; raising makes an accidental ``super()`` call
        # fail loudly instead of handing a non-list object to the caller.
        raise NotImplementedError
class NautilusOcrTokenizer(Tokenizer):
    """Whitespace tokenizer that re-joins words hyphenated across line breaks."""

    # Characters that, when they are the last character of a token directly
    # followed by a newline, mark a word continued on the next line.
    # "\u2e17" is the DOUBLE OBLIQUE HYPHEN; it is written as an escape so the
    # literal cannot be corrupted by a wrong source encoding.
    _HYPHENS = {"-", "\u2e17", "="}

    def tokenize(self, text: str) -> List[str]:
        """`Nautilus-OCR tokenizer <https://github.com/natliblux/nautilusocr/blob/2d4d59c45466b5cc8c9897798bd8b205a7f0c02c/src/epr/features_epr.py#L84>`_

        Tokens are separated by spaces and newlines. When a token ends with
        one of ``_HYPHENS`` immediately before a newline, the hyphen is
        dropped and the token continues with the text of the next line.
        Afterwards at most one non-alphabetic character is removed from the
        end and from the start of every token.

        :param text: The text to tokenize.
        :return: The tokens in order of appearance. A token that consists
            only of trimmed characters (e.g. ``"5"``) is returned as ``""``.
        """
        tokens = []
        new_token = ""
        for c in text:
            if c == " " or c == "\n":
                if not new_token:
                    # Separator with no token in progress (leading or repeated
                    # whitespace): skip it so it never becomes part of a token.
                    continue
                if c == "\n" and new_token[-1] in self._HYPHENS:
                    # Line-end hyphenation: drop the hyphen and keep building
                    # the same token from the following line.
                    new_token = new_token[:-1]
                else:
                    tokens.append(new_token)
                    new_token = ""
            else:
                new_token += c
        if new_token:
            tokens.append(new_token)
        return [self._trim_edges(token) for token in tokens]

    @staticmethod
    def _trim_edges(token: str) -> str:
        """Remove one trailing and one leading non-alphabetic character."""
        # Each step works on the result of the previous one, so both ends are
        # trimmed when both are non-alphabetic; the truthiness guards keep
        # the indexing safe once the token has become empty.
        if token and not token[-1].isalpha():
            token = token[:-1]
        if token and not token[0].isalpha():
            token = token[1:]
        return token