# Source code for text_quality.feature.featurize

from typing import List
from typing import TypedDict
import pandas as pd
from .scorer.dictionary import HunspellDictionary
from .scorer.dictionary import TokenDictionary
from .scorer.garbage import GarbageDetector
from .scorer.q_gram import QGram
from .tokenizer import Tokenizer


class Scorers(TypedDict):
    """A configuration of features and their respective scorers.

    Each key is a feature name; the value is the scorer object
    responsible for that feature.
    """

    # Scorer of type HunspellDictionary.
    dict_score: HunspellDictionary
    # Scorer of type TokenDictionary.
    dict_score_gt: TokenDictionary
    # Scorer of type QGram.
    n_gram_score: QGram
    # Scorer of type GarbageDetector.
    garbage_score: GarbageDetector
class Featurizer:
    """A collection of scorers to featurize an input text.

    The text is tokenized once with the configured tokenizer, and every
    scorer is then asked to score the same token list.
    """

    def __init__(self, scorers: Scorers, tokenizer: Tokenizer) -> None:
        """Store the scorer configuration and the tokenizer.

        Args:
            scorers: Mapping from feature name to the scorer for that feature.
            tokenizer: Object whose ``tokenize(text)`` yields the tokens that
                are handed to each scorer.
        """
        self._scorers = scorers
        self._tokenizer = tokenizer

    @property
    def features(self) -> List[str]:
        """Return the feature names, in the scorer mapping's iteration order."""
        return list(self._scorers)

    def featurize(self, text: str) -> tuple[dict[str, float], List[str]]:
        """Tokenize ``text`` and score the tokens with every scorer.

        Args:
            text: The input text.

        Returns:
            A pair ``(scores, tokens)``: ``scores`` maps each feature name to
            the result of its scorer's ``score(tokens)`` call, and ``tokens``
            is the tokenizer output those scores were computed from.
        """
        tokens = self._tokenizer.tokenize(text)
        scores = {
            feature: scorer.score(tokens)
            for feature, scorer in self._scorers.items()
        }
        return scores, tokens

    def featurize_as_dataframe(self, text: str) -> tuple[pd.DataFrame, List[str]]:
        """Like :meth:`featurize`, but with the scores as a one-row DataFrame.

        Args:
            text: The input text.

        Returns:
            A pair ``(frame, tokens)``, where ``frame`` is the result of
            :meth:`as_dataframe` applied to the feature scores.
        """
        features, tokens = self.featurize(text)
        return Featurizer.as_dataframe(features), tokens

    @staticmethod
    def as_dataframe(features: dict[str, float]) -> pd.DataFrame:
        """Convert a feature mapping into a DataFrame.

        Args:
            features: Mapping from feature name to value.

        Returns:
            A DataFrame with one column per feature, each holding that
            feature's single value.
        """
        # Wrap each value in a list so pandas builds a one-row column per
        # feature instead of requiring an explicit index for scalar values.
        return pd.DataFrame({feature: [value] for feature, value in features.items()})