Source code for nlpretext.preprocessor

from typing import Any, Callable, Dict, List, Optional

from nlpretext.basic.preprocess import fix_bad_unicode, normalize_whitespace, remove_eol_characters
from nlpretext.social.preprocess import (
    remove_emoji,
    remove_hashtag,
    remove_html_tags,
    remove_mentions,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


class Preprocessor:
    """
    Chain text preprocessing functions and apply them as an sklearn pipeline.

    Operations are registered with ``pipe`` and applied, in registration
    order, by ``run``. When no operation has been registered, ``run`` applies
    a default sequence of cleaning functions instead.
    """

    def __init__(self):
        """Initialize preprocessor object to apply all text transformation."""
        # Registered operations, each stored as {"operation": ..., "args": ...}.
        self.__operations = []
        # Pipeline built by the most recent call to ``run``; None before that.
        self.pipeline = None

    def pipe(self, operation: Callable[[Any], Any], args: Optional[Dict[str, Any]] = None) -> None:
        """
        Add an operation and its arguments to pipe in the preprocessor.

        Parameters
        ----------
        operation : callable
            text preprocessing function
        args : dict of arguments, optional
            keyword arguments forwarded to ``operation`` when it is applied
        """
        self.__operations.append({"operation": operation, "args": args})

    @staticmethod
    def _step_names(operation_list: List[Dict[Any, Any]]) -> List[str]:
        """
        Derive one unique pipeline step name per operation.

        The name is the callable's ``__name__`` when it has one; otherwise the
        name of its type is used (e.g. ``"partial"`` for ``functools.partial``
        objects, which carry no ``__name__``). The first occurrence of a name
        is kept untouched and later repeats receive a numeric suffix
        (``name_2``, ``name_3``, ...), so the same function can be piped
        several times without producing duplicate step names.

        Parameters
        ----------
        operation_list : iterable
            list of __operations of preprocessing

        Returns
        -------
        list of str
            step names, in the same order as ``operation_list``
        """
        names = []
        taken = set()
        for entry in operation_list:
            operation = entry["operation"]
            base = getattr(operation, "__name__", None) or type(operation).__name__
            candidate = base
            suffix = 1
            # Keep bumping the suffix so a generated name can never collide
            # with a function that is literally called e.g. ``foo_2``.
            while candidate in taken:
                suffix += 1
                candidate = "{}_{}".format(base, suffix)
            taken.add(candidate)
            names.append(candidate)
        return names

    @staticmethod
    def build_pipeline(operation_list: List[Dict[Any, Any]]) -> "Pipeline":
        """
        Build sklearn pipeline from a operation list.

        Each operation is wrapped in a ``FunctionTransformer`` that receives
        the operation's ``args`` as keyword arguments. Step names come from
        the callables' names and are made unique when an operation is
        repeated or has no ``__name__``.

        Parameters
        ----------
        operation_list : iterable
            list of __operations of preprocessing

        Returns
        -------
        sklearn.pipeline.Pipeline
        """
        # Materialize once: the entries are walked twice (names, then steps).
        entries = list(operation_list)
        step_names = Preprocessor._step_names(entries)
        return Pipeline(
            steps=[
                (
                    step_name,
                    FunctionTransformer(entry["operation"], kw_args=entry["args"]),
                )
                for step_name, entry in zip(step_names, entries)
            ]
        )

    def run(self, text: str) -> str:
        """
        Apply pipeline to text.

        The pipeline is rebuilt on every call from the operations registered
        so far and stored on ``self.pipeline``. If nothing was registered, a
        default sequence is used: HTML tags, mentions, emoji, hashtags and
        end-of-line characters are removed, then bad unicode is fixed and
        whitespace is normalized.

        Parameters
        ----------
        text : string
            text to preprocess

        Returns
        -------
        string
        """
        operations = self.__operations
        if not operations:
            default_operations = (
                remove_html_tags,
                remove_mentions,
                remove_emoji,
                remove_hashtag,
                remove_eol_characters,
                fix_bad_unicode,
                normalize_whitespace,
            )
            operations = [
                {"operation": operation, "args": None} for operation in default_operations
            ]
        self.pipeline = self.build_pipeline(operations)
        text = self.pipeline.transform(text)
        return text