Source code for nlpretext.token.preprocess

# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License


from typing import List, Optional

import re

from nlpretext._utils.stopwords import get_stopwords


def remove_stopwords(
    tokens: List[str], lang: str, custom_stopwords: Optional[List[str]] = None
) -> List[str]:
    """
    Remove stopwords from a list of tokens.

    eg. 'I like when you move your body !' -> 'I move body !'
    (illustrative: the actual result depends on the stopword list returned
    by ``get_stopwords`` for the requested language).

    Matching is an exact, case-sensitive comparison of each token against
    the stopword collection.

    Parameters
    ----------
    tokens: list(str)
        list of tokens
    lang: str
        language iso code (e.g : "en")
    custom_stopwords : list(str)|None
        list of custom stopwords to add. None by default

    Returns
    -------
    list
        tokens without stopwords, in their original order

    Notes
    -----
    No validation is performed here; any exception raised by
    ``get_stopwords`` for the given ``lang`` propagates to the caller.
    """
    # Copy into a set: membership tests become O(1) per token, and the
    # object returned by get_stopwords is never mutated in place.
    stopwords = set(get_stopwords(lang))
    if custom_stopwords:
        stopwords.update(custom_stopwords)
    return [word for word in tokens if word not in stopwords]
def remove_tokens_with_nonletters(tokens: List[str]) -> List[str]:
    """
    Keep only the tokens made exclusively of ASCII letters.

    Any token containing at least one character outside ``a-z`` / ``A-Z``
    (digits, punctuation, accented letters, whitespace, ...) is dropped.
    ['foo','bar','124','34euros'] -> ['foo','bar'].

    Parameters
    ----------
    tokens : list
        list of tokens to be cleaned

    Returns
    -------
    list
        list of tokens that contain no non-letter character
    """
    non_letter = re.compile("[^a-zA-Z]")
    letters_only = []
    for token in tokens:
        if non_letter.search(token) is not None:
            # At least one non-letter character: discard the token.
            continue
        letters_only.append(token)
    return letters_only
def remove_special_caracters_from_tokenslist(tokens: List[str]) -> List[str]:
    """
    Drop the tokens that contain no ASCII letter and no digit.

    eg. ['foo','bar','---',"'s",'#'] -> ['foo','bar',"'s"].

    Parameters
    ----------
    tokens : list
        list of tokens to be cleaned

    Returns
    -------
    list
        list of tokens, without those made only of special characters
    """
    # A token is kept as soon as one alphanumeric character is found in it.
    has_alphanumeric = re.compile("[a-zA-Z0-9]").search
    return list(filter(has_alphanumeric, tokens))
def remove_smallwords(tokens: List[str], smallwords_threshold: int) -> List[str]:
    """
    Keep only the tokens whose length is strictly greater than a threshold.

    With a threshold of 2:
    ["hello", "my", "name", "is", "John", "Doe"] --> ["hello","name","John","Doe"].

    Parameters
    ----------
    tokens : list
        list of strings
    smallwords_threshold: int
        tokens whose length is lower than or equal to this value are removed

    Returns
    -------
    list
        the remaining tokens, in their original order
    """
    long_enough = []
    for token in tokens:
        if len(token) > smallwords_threshold:
            long_enough.append(token)
    return long_enough