# Source code for text_quality.feature.featurize

from typing import List
from typing import TypedDict
import pandas as pd
from .scorer.dictionary import HunspellDictionary
from .scorer.dictionary import TokenDictionary
from .scorer.garbage import GarbageDetector
from .scorer.q_gram import QGram
from .tokenizer import Tokenizer



class Scorers(TypedDict):
    """A configuration of features and their respective scorers.

    Each key is a feature name and each value is the scorer object that
    produces the score for that feature.
    """

    # Scorer for the "dict_score" feature.
    dict_score: HunspellDictionary
    # Scorer for the "dict_score_gt" feature.
    dict_score_gt: TokenDictionary
    # Scorer for the "n_gram_score" feature.
    n_gram_score: QGram
    # Scorer for the "garbage_score" feature.
    garbage_score: GarbageDetector





class Featurizer:
    """A collection of scorers to featurize an input text."""

    def __init__(self, scorers: Scorers, tokenizer: Tokenizer) -> None:
        """Store the scorers and the tokenizer used to featurize texts.

        Args:
            scorers: Mapping from feature name to the scorer producing it.
            tokenizer: Object whose ``tokenize(text)`` method splits a text
                into the tokens handed to every scorer.
        """
        self._scorers = scorers
        self._tokenizer = tokenizer

    @property
    def features(self) -> List[str]:
        """Return the names of the configured features."""
        # Iterating a mapping yields its keys, i.e. the feature names.
        return list(self._scorers)

    def featurize(self, text: str) -> tuple[dict[str, float], List[str]]:
        """Tokenize ``text`` and score the tokens with every scorer.

        Args:
            text: The input text to featurize.

        Returns:
            A pair ``(scores, tokens)``: ``scores`` maps each feature name to
            the value returned by its scorer's ``score(tokens)``, and
            ``tokens`` is the tokenizer output the scores were computed from.
        """
        # Tokenize once so that all scorers see the same token sequence.
        tokens = self._tokenizer.tokenize(text)
        scores = {
            feature: scorer.score(tokens)
            for feature, scorer in self._scorers.items()
        }
        return scores, tokens

    def featurize_as_dataframe(self, text: str) -> tuple[pd.DataFrame, List[str]]:
        """Featurize ``text`` and return the scores as a DataFrame.

        Args:
            text: The input text to featurize.

        Returns:
            A pair ``(dataframe, tokens)``: the scores from ``featurize``
            converted with ``as_dataframe``, and the tokens of the text.
        """
        features, tokens = self.featurize(text)
        return Featurizer.as_dataframe(features), tokens

    @staticmethod
    def as_dataframe(features: dict[str, float]) -> pd.DataFrame:
        """Convert a feature-to-value mapping into a DataFrame.

        Args:
            features: Mapping from feature name to its value.

        Returns:
            A DataFrame with one column per feature, each column holding
            that feature's single value.
        """
        # Wrap each value in a one-element list so pandas builds a column
        # from it instead of rejecting the scalars for lack of an index.
        return pd.DataFrame({feature: [value] for feature, value in features.items()})