Source code for nlpretext.token.tokenizer

# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# mypy: disable-error-code="assignment"

import os
import re
import subprocess
import sys
from typing import Any, Dict, List, Optional, Union

import nltk
import spacy
from sacremoses import MosesDetokenizer, MosesTokenizer

# Shape of a spaCy pipeline package name: <2-letter lang>_<type>_<genre>_<size>,
# e.g. "en_core_web_sm". _load_spacy_model only attempts an automatic download
# for names that match this pattern.
MODEL_REGEX = re.compile(r"^[a-z]{2}_(?:core|dep|ent|sent)_(?:web|news|wiki|ud)_(?:sm|md|lg|trf)$")
# Values accepted by tokenize() for its ``lang_module`` argument, written as
# "<language code>_<tokenizer backend>".
SUPPORTED_LANG_MODULES = {"en_spacy", "en_nltk", "fr_spacy", "fr_moses", "ko_spacy", "ja_spacy"}


class LanguageNotHandled(Exception):
    """Raised when a language code has no spaCy model associated with it."""
class LanguageNotInstalledError(Exception):
    """Raised when a spaCy model is not installed and cannot be fetched automatically."""
class SpacyModel:
    """
    Process-wide cache of loaded spaCy pipelines, keyed by language code.

    Instantiating ``SpacyModel(lang)`` loads the pipeline for ``lang`` the
    first time that language is requested and reuses it afterwards. The
    pipeline for the requested language is exposed as ``self.model``.

    Parameters
    ----------
    lang : str
        Language code. Languages handled : ["en", "fr", "ko", "ja"]

    Raises
    ------
    LanguageNotHandled
        If ``lang`` is not one of the handled languages.
    """

    class SingletonSpacyModel:
        """Load the spaCy pipeline associated with a language code."""

        def __init__(self, lang: str) -> None:
            self.lang = lang
            if lang == "en":
                self.model = _load_spacy_model("en_core_web_sm")
            elif lang == "fr":
                self.model = _load_spacy_model("fr_core_news_sm")
            elif lang in ("ko", "ja"):
                # No packaged pipeline is loaded for these languages; a blank
                # pipeline is created instead.
                self.model = spacy.blank(lang)
            else:
                raise LanguageNotHandled("This spacy model is not available")

    # First pipeline loaded in this process. Kept as a class attribute so that
    # code reading ``SpacyModel.model`` directly keeps working.
    model: Optional[spacy.language.Language] = None
    # One loaded pipeline per language code, shared by all instances on
    # purpose. Previously a single class-level model was reused for every
    # language, so requesting "fr" after "en" returned the English pipeline.
    _models_by_lang: Dict[str, spacy.language.Language] = {}

    def __init__(self, lang):
        pipeline = SpacyModel._models_by_lang.get(lang)
        if pipeline is None:
            pipeline = SpacyModel.SingletonSpacyModel(lang).model
            SpacyModel._models_by_lang[lang] = pipeline
        # Instance attribute: always the pipeline of the requested language.
        self.model = pipeline
        if SpacyModel.model is None:
            SpacyModel.model = pipeline

    def get_lang_model(self) -> Optional[str]:
        """
        Return the language code of the loaded pipeline.

        Returns
        -------
        str or None
            ``self.model.lang`` when a pipeline is loaded, otherwise ``None``.
        """
        if self.model is None:
            return None
        lang: str = self.model.lang
        return lang
def _load_spacy_model(model: str) -> Any:
    """
    Load a spaCy pipeline, downloading it first when it is missing.

    Parameters
    ----------
    model : str
        Name of the spaCy pipeline package, e.g. "en_core_web_sm".

    Returns
    -------
    Any
        The object returned by ``spacy.load(model)``.

    Raises
    ------
    LanguageNotInstalledError
        If the model is not installed and its name does not match
        ``MODEL_REGEX``, in which case no download is attempted.
    OSError
        If the model still cannot be loaded after the download attempt.
    """
    try:
        return spacy.load(model)
    except OSError as e:
        if not MODEL_REGEX.match(model):
            raise LanguageNotInstalledError(
                f"Model {model} is not installed. "
                f"To install, run: python -m spacy download {model}"
            ) from e
        # Run the download with the interpreter executing this code, so the
        # package lands in the current environment rather than in whatever
        # "python" happens to be first on PATH. Passing an argument list also
        # avoids going through a shell. The exit status is deliberately not
        # checked: a failed download surfaces as OSError from the load below.
        subprocess.run(  # nosec
            [sys.executable, "-m", "spacy", "download", model],
            check=False,
        )
        return spacy.load(model)


def _get_spacy_tokenizer(lang: str) -> Optional[spacy.tokenizer.Tokenizer]:
    """
    Get the spaCy tokenizer for the given language.

    Parameters
    ----------
    lang : str
        Language in which text is written. Languages handled : ["en", "fr", "ko", "ja"]

    Returns
    -------
    spacy.tokenizer.Tokenizer or None
        The ``tokenizer`` attribute of the pipeline obtained from
        ``SpacyModel(lang)``, or ``None`` when no pipeline is available.
    """
    model = SpacyModel(lang).model
    return model.tokenizer if model else None
def tokenize(text: str, lang_module: str = "en_spacy") -> List[str]:
    """
    Split a text into a list of tokens.

    Parameters
    ----------
    text : str
        Text to tokenize.
    lang_module : str {'en_spacy', 'en_nltk', 'fr_spacy', 'fr_moses', 'ko_spacy', 'ja_spacy'}
        Tokenization module to use, written as ``<language>_<implementation>``.

    Returns
    -------
    list
        Tokens as strings. An empty list is returned when no spaCy tokenizer
        could be obtained for the requested language.

    Raises
    ------
    ValueError
        If lang_module is not a valid module name
    """
    if lang_module not in SUPPORTED_LANG_MODULES:
        raise ValueError(
            f"Invalid lang_module: {lang_module}. "
            f"lang_module must be one of {SUPPORTED_LANG_MODULES}."
        )

    if "spacy" in lang_module:
        # The language code is the part before the first underscore.
        language_code = lang_module.partition("_")[0]
        spacy_tokenizer = _get_spacy_tokenizer(language_code)
        if not spacy_tokenizer:
            return []
        return [token.text for token in spacy_tokenizer(text)]

    if lang_module == "en_nltk":
        return nltk.word_tokenize(text)

    if lang_module == "fr_moses":
        moses = MosesTokenizer(lang="fr")
        return moses.tokenize(text, escape=False)

    return []
def untokenize(tokens: List[str], lang: str = "fr") -> str:
    """
    Join a list of tokens back into a single string with the Moses detokenizer.

    Parameters
    ----------
    tokens : list of str
        Tokens to join.
    lang : string
        Language code passed to ``MosesDetokenizer``.

    Returns
    -------
    string
        Detokenized text (HTML unescaping is disabled).
    """
    detokenizer = MosesDetokenizer(lang=lang)
    return detokenizer.detokenize(tokens, unescape=False)
def convert_tokens_to_string(
    tokens_or_str: Optional[Union[str, List[str]]], lang: str = "fr"
) -> str:
    """
    Return the input as a string, detokenizing it when it is a list of tokens.

    Parameters
    ----------
    tokens_or_str : str, list of str or None
        A string is returned unchanged, a list of tokens is joined with
        ``untokenize`` and ``None`` yields an empty string.
    lang : str
        Language code forwarded to ``untokenize`` when a list is given.
        Defaults to "fr", the language that was always used before this
        parameter existed.

    Returns
    -------
    str
        The text.

    Raises
    ------
    TypeError
        If the input is neither a string, a list nor ``None``.
    """
    if tokens_or_str is None:
        return ""
    if isinstance(tokens_or_str, str):
        return tokens_or_str
    if isinstance(tokens_or_str, list):
        return untokenize(tokens_or_str, lang=lang)
    raise TypeError("Please input string or tokens")
def convert_string_to_tokens(
    tokens_or_str: Optional[Union[str, List[str]]], lang_module: str = "en_spacy"
) -> List[str]:
    """
    Return the input as a list of tokens, tokenizing it when it is a string.

    Parameters
    ----------
    tokens_or_str : str, list of str or None
        A list is returned as is, a string is split with ``tokenize`` and
        ``None`` yields an empty list.
    lang_module : str
        Tokenization module forwarded to ``tokenize`` when a string is given.

    Returns
    -------
    list
        The tokens.

    Raises
    ------
    TypeError
        If the input is neither a string, a list nor ``None``.
    """
    if tokens_or_str is None:
        return []
    if isinstance(tokens_or_str, list):
        return tokens_or_str
    if not isinstance(tokens_or_str, str):
        raise TypeError("Please input string or tokens")
    return tokenize(tokens_or_str, lang_module=lang_module)