Source code for text_quality.feature.scorer.garbage

from typing import List
from .scorer import Scorer


class GarbageDetector(Scorer):
    """Scorer that reports the share of tokens flagged as garbage.

    Each token is checked against a fixed set of heuristic rules (numbered
    1-9 in the comments below); a token counts as garbage as soon as one
    rule matches.

    `See Nautilus-OCR <https://github.com/natliblux/nautilusocr/blob/2d4d59c45466b5cc8c9897798bd8b205a7f0c02c/src/epr/features_epr.py#L148>`_
    """

    # Lower-case letters treated as vowels; every other alphabetic character
    # is counted as a consonant.
    _VOWELS = "aäàáâǎeéèêëěiîïíìıoöôòóǒuüûùúǔ"
    # Rule 1: a token with at least this many characters is garbage.
    EPR_RULE1 = 21
    # Rule 2: a character repeating its predecessor this many times in a row
    # (i.e. a run of EPR_RULE2 + 1 identical characters) is garbage.
    EPR_RULE2 = 3
    # Rule 3: this many consecutive vowels is garbage.
    EPR_RULE3 = 4
    # Rule 4: this many consecutive consonants is garbage.
    EPR_RULE4 = 6
    # Rule 5: in an all-alphabetic token containing both vowels and
    # consonants, one kind outnumbering the other by more than this factor
    # is garbage.
    EPR_RULE5 = 8
    # Rule 9: at least this many distinct special characters at non-outer
    # positions is garbage.
    EPR_RULE9 = 2

    def score(self, tokens: List[str]) -> float:
        """Return the fraction of ``tokens`` that are flagged as garbage.

        `See Nautilus-OCR <https://github.com/natliblux/nautilusocr/blob/2d4d59c45466b5cc8c9897798bd8b205a7f0c02c/src/epr/features_epr.py#L148>`_

        :param tokens: the tokens to inspect.
        :return: ``0.0`` for an empty list, otherwise the number of flagged
            tokens divided by ``len(tokens)``.
        """
        total = len(tokens)
        if total == 0:
            # Nothing to inspect; also avoids dividing by zero below.
            return 0.0

        issues = sum(1 for token in tokens if self._is_garbage(token))
        return issues / total

    @staticmethod
    def _is_garbage(token: str) -> bool:  # noqa: MC0001
        """Return ``True`` when ``token`` matches any of the garbage rules."""
        # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements
        # rule 1: overly long token
        if len(token) >= GarbageDetector.EPR_RULE1:
            return True

        vowel_count = 0
        consonant_count = 0
        lower_case_count = 0
        upper_case_count = 0
        special_char_count = 0
        inner_special_chars = set()
        alpha = True  # stays True only while every character is alphabetic
        last_char = None
        repetition_streak = 0
        vowel_streak = 0
        consonant_streak = 0
        last_index = len(token) - 1

        for index, char in enumerate(token):
            # collect token info
            if char.isalpha():
                if char.lower() in GarbageDetector._VOWELS:
                    vowel_count += 1
                    vowel_streak += 1
                    consonant_streak = 0
                else:
                    consonant_count += 1
                    consonant_streak += 1
                    vowel_streak = 0
                # Any alphabetic character that is not upper case is counted
                # on the lower-case side.
                if char.isupper():
                    upper_case_count += 1
                else:
                    lower_case_count += 1
            elif char.isalnum():
                # Alphanumeric but not alphabetic (e.g. a digit): interrupts
                # the streaks but is not counted as a special character.
                alpha = False
                vowel_streak = 0
                consonant_streak = 0
            else:
                special_char_count += 1
                alpha = False
                vowel_streak = 0
                consonant_streak = 0
                # Special characters on the first/last position are ignored
                # by rule 9.
                if index not in (0, last_index):
                    inner_special_chars.add(char)

            # rule 3: too many vowels in a row
            if vowel_streak >= GarbageDetector.EPR_RULE3:
                return True
            # rule 4: too many consonants in a row
            if consonant_streak >= GarbageDetector.EPR_RULE4:
                return True

            if last_char is not None and char == last_char:
                repetition_streak += 1
                # rule 2: same character repeated too often in a row
                if repetition_streak >= GarbageDetector.EPR_RULE2:
                    return True
            else:
                repetition_streak = 0
            last_char = char

        if alpha and vowel_count > 0 and consonant_count > 0:
            # rule 5a: consonants heavily outnumber vowels
            if vowel_count * GarbageDetector.EPR_RULE5 < consonant_count:
                return True
            # rule 5b: vowels heavily outnumber consonants
            if consonant_count * GarbageDetector.EPR_RULE5 < vowel_count:
                return True

        # rule 6: mixed case with more upper-case than lower-case letters
        if lower_case_count > 0 and upper_case_count > lower_case_count:
            return True

        # rule 7: contains upper case although first and last character are
        # lower case (upper_case_count > 0 guarantees a non-empty token)
        if upper_case_count > 0 and token[0].islower() and token[-1].islower():
            return True

        # rule 8: at least as many special characters as other characters
        regular_chars = len(token) - special_char_count
        if special_char_count >= regular_chars and regular_chars > 0:
            return True

        # rule 9: several distinct special characters inside the token
        return len(inner_special_chars) >= GarbageDetector.EPR_RULE9