# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from typing import List, Tuple
import emoji as _emoji
from nlpretext._config import constants
from nlpretext.basic.preprocess import normalize_whitespace
def remove_mentions(text: str) -> str:
    """
    Strip mentions (words preceded with a '@') from a text.

    Every match of ``constants.AT_PATTERN`` is replaced with an empty string,
    and the result is then passed through ``normalize_whitespace``.

    Parameters
    ----------
    text : str
        Text that may contain mentions.

    Returns
    -------
    str
        The text without mentions.
    """
    without_mentions = constants.AT_PATTERN.sub("", text)
    return normalize_whitespace(without_mentions)
def remove_emoji(text: str) -> str:
    """
    Delete emoji from a string.

    Delegates to ``replace_emoji`` from the ``emoji`` package, substituting
    an empty string for each emoji it finds. For the list of emoji defined
    by the unicode convention, see:
    http://www.unicode.org/emoji/charts/full-emoji-list.html.

    Parameters
    ----------
    text : str
        Text that may contain emoji.

    Returns
    -------
    str
        The text with emoji replaced by an empty string.
    """
    return _emoji.replace_emoji(text, "")
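# TODO(review): the default value of `code_delimiters` below is a tuple, which is
# immutable, so this item looks resolved -- confirm and remove. Background on the
# mutable-default-argument pitfall:
# https://docs.quantifiedcode.com/python-anti-patterns/correctness/mutable_default_value_as_argument.html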
def convert_emoji_to_text(text: str, code_delimiters: Tuple[str, str] = (":", ":")) -> str:
    """
    Replace each emoji with its CLDR Short Name, following the unicode convention
    http://www.unicode.org/emoji/charts/full-emoji-list.html
    eg. 😀 --> :grinning_face:

    The conversion is delegated to ``demojize`` from the ``emoji`` package.

    Parameters
    ----------
    text : str
        Text that may contain emoji.
    code_delimiters : tuple of str
        Pair of symbols placed around the emoji code,
        eg: (':',':') --> :grinning_face:

    Returns
    -------
    str
        The text with emoji converted to their delimited codes.
    """
    demojized_text = _emoji.demojize(text, delimiters=code_delimiters)
    return demojized_text
def remove_hashtag(text: str) -> str:
    """
    Strip hashtags (words preceded with a '#') from a text
    eg. "I take care of my skin #selfcare#selfestim" --> "I take care of my skin".

    Every match of ``constants.HASHTAG_PATTERN`` is replaced with an empty
    string, and the result is then passed through ``normalize_whitespace``.

    Parameters
    ----------
    text : str
        Text of a post that may contain hashtags.

    Returns
    -------
    str
        Text of the post without hashtags.
    """
    without_hashtags = constants.HASHTAG_PATTERN.sub("", text)
    return normalize_whitespace(without_hashtags)