Python emoji.UNICODE_EMOJI Examples

The following are 20 code examples of emoji.UNICODE_EMOJI. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module emoji, or try the search function.
Example #1
Source File: emojireact.py    From Trusty-cogs-archive with MIT License 6 votes vote down vote up
async def on_message(self, message):
        """React to a message with every emoji it mentions.

        Fixes vs. the scraped snippet: restored the ``async`` keyword
        (the body uses ``await``, so a plain ``def`` is a SyntaxError),
        removed the unused ``channel`` local, and narrowed the bare
        ``except:`` to ``except Exception`` while keeping the original
        best-effort behavior.
        """
        # Only react on servers that have the feature configured and enabled.
        if message.server.id not in self.settings:
            return
        if not self.settings[message.server.id]:
            return
        emoji_list = []
        for word in message.content.split(" "):
            # Custom Discord emoji look like <:name:id>; keep the inner part.
            if word.startswith("<:") and word.endswith(">"):
                emoji_list.append(word.rpartition(">")[0].partition("<")[2])
            if word in UNICODE_EMOJI:
                emoji_list.append(word)
        if not emoji_list:
            return
        for emoji_char in emoji_list:
            try:
                await self.bot.add_reaction(message, emoji_char)
            except Exception:
                # Best-effort: adding a reaction may fail (missing
                # permissions, invalid emoji); keep going regardless.
                pass
Example #2
Source File: filter_utils.py    From neural_chat with MIT License 6 votes vote down vote up
def separate_emojis_and_text(text):
    """Split *text* into (emoji characters, everything else), order kept."""
    emoji_part = []
    text_part = []
    for ch in text:
        bucket = emoji_part if ch in emoji.UNICODE_EMOJI else text_part
        bucket.append(ch)
    return ''.join(emoji_part), ''.join(text_part)
Example #3
Source File: png.py    From wttr.in with Apache License 2.0 6 votes vote down vote up
def _script_category(char):
    """Return the writing-script category of a Unicode character.

    Possible values: "Emoji", default, Cyrillic, Greek, Han, Hiragana, ...
    """
    # Emoji take precedence over any script classification.
    if char in emoji.UNICODE_EMOJI:
        return "Emoji"

    category = unicodedata2.script_cat(char)[0]
    # The colon is rendered with the Han font in this renderer.
    if char == u':':
        return 'Han'
    return 'default' if category in ('Latin', 'Common') else category
Example #4
Source File: filter_input.py    From neural_chat with MIT License 5 votes vote down vote up
def read_english(path="english_words.txt", add_emojis=True):
    """Load the lowercase English vocabulary used for filtering.

    One word per line; blank lines are skipped. When *add_emojis* is
    true, every known emoji is included in the returned set as well.
    """
    words = set()
    with codecs.open(path, "r", "utf-8") as handle:
        for raw in handle:
            cleaned = raw.strip().lower().replace('\n', '')
            if cleaned:
                words.add(cleaned)
    if add_emojis:
        words.update(UNICODE_EMOJI)
    return words
Example #5
Source File: preprocess.py    From project-purifier with Apache License 2.0 5 votes vote down vote up
def preprocess_string(text):
    """Preprocess the input text.

    Removes emoji, splits around punctuation groups, separates
    stand-alone Korean jamo/digits, and breaks up jamo clusters that
    are commonly used on their own in Korean (e.g. ㅋㅋ, ㅠㅠ).

    :param text: str
    :return: str
    """

    # Remove emoji first.
    without_emoji = ''.join(ch for ch in text if ch not in emoji.UNICODE_EMOJI)

    # Split on punctuation (the captured groups are kept by re.split).
    pieces = re.split(r'([!,?]+)|([.]+)|([,]+)|(["])|([\'])|([&]+)|([(]+)|([)]+)|([~]+)|([♡]+)|([☆,★]+)',
                      without_emoji.strip())
    # Drop empty strings and the Nones produced by unmatched groups.
    depunctuated = ' '.join(piece for piece in pieces if piece)

    # Separate stand-alone jamo and digits.
    jamo_split = re.split(r'([ㄱ-ㅣ0-9]+)', depunctuated.strip())
    jamo_joined = ' '.join(jamo_split)

    # Split off jamo clusters that frequently stand alone in Korean.
    final_pieces = re.split(r'([ㅎ]{2,})|([ㅜ,ㅠ]{2,})|([ㅗ]+)|([ㅋ,ㄱ,ㄲ]{2,})|\s+', jamo_joined.strip())
    return ' '.join(piece for piece in final_pieces if piece)
Example #6
Source File: chatline.py    From WhatsApp-Analyzer with MIT License 5 votes vote down vote up
def extract_emojis(self, string=""):
        """Return the emoji characters found in *string*, in order."""
        return [ch for ch in string if ch in emoji.UNICODE_EMOJI]
Example #7
Source File: filter_utils.py    From ELSA with MIT License 5 votes vote down vote up
def separate_emojis_and_text(text):
    """Partition *text* into an emoji-only string and a non-emoji string."""
    found = [c for c in text if c in emoji.UNICODE_EMOJI]
    rest = [c for c in text if c not in emoji.UNICODE_EMOJI]
    return ''.join(found), ''.join(rest)
Example #8
Source File: data_processing.py    From Sarcasm-Detection with MIT License 5 votes vote down vote up
def check_if_emoji(word, emoji_dict):
    """True if any character of *word* is an emoji (known either to
    *emoji_dict* or to the emoji package)."""
    return any(ch in emoji_dict.keys() or ch in emoji.UNICODE_EMOJI
               for ch in word)


# A strict clean of the twitter data - removing emojis, hashtags, URLs, user mentions 
Example #9
Source File: data_processing.py    From Sarcasm-Detection with MIT License 5 votes vote down vote up
def clean_tweet(tweet, word_list, split_hashtag_method, replace_user_mentions=True,
                remove_hashtags=False, remove_emojis=False, all_to_lower_case=False):
    """Clean a tweet: pad punctuation, drop #sarca* tags and URLs,
    normalize user mentions, split or remove hashtags, and optionally
    remove emojis and lower-case every token.

    :param tweet: raw tweet text
    :param word_list: vocabulary forwarded to *split_hashtag_method*
    :param split_hashtag_method: callable(hashtag_body, word_list) -> list of words
    :return: the cleaned tweet as a single space-joined string
    """
    # Add white space before every punctuation sign so that we can split around it and keep it
    tweet = re.sub('([!?*&%"~`^+{}])', r' \1 ', tweet)
    # Fix: raw string literal — '\s' in a plain literal is an invalid escape
    # sequence (SyntaxWarning on modern Python); the pattern is unchanged.
    tweet = re.sub(r'\s{2,}', ' ', tweet)
    tokens = tweet.split()
    valid_tokens = []
    for word in tokens:
        # Never include #sarca* hashtags
        if word.lower().startswith('#sarca'):
            continue
        # Never include URLs
        if 'http' in word:
            continue
        # Replace specific user mentions with a general user name
        if replace_user_mentions and word.startswith('@'):
            word = '@user'
        # Split or remove hashtags
        if word.startswith('#'):
            if remove_hashtags:
                continue
            splits = split_hashtag_method(word[1:], word_list)
            if all_to_lower_case:
                valid_tokens.extend([split.lower() for split in splits])
            else:
                valid_tokens.extend(splits)
            continue
        if remove_emojis and word in emoji.UNICODE_EMOJI:
            continue
        if all_to_lower_case:
            word = word.lower()
        valid_tokens.append(word)
    return ' '.join(valid_tokens)
Example #10
Source File: data_processing.py    From Sarcasm-Detection with MIT License 5 votes vote down vote up
def process_emojis(word, emoji_dict, translate_emojis=True):
    """Split *word* into plain-text runs and emoji.

    Emoji known to *emoji_dict* are replaced by their lower-cased
    description words when *translate_emojis* is true; otherwise the
    emoji characters themselves are kept. If nothing was collected,
    *word* is returned unchanged.
    """
    tokens = []
    buffer = ""
    for ch in word:
        is_emoji = ch in emoji_dict.keys() or ch in emoji.UNICODE_EMOJI
        if not is_emoji:
            buffer += ch
            continue
        # Flush any accumulated plain text before handling the emoji.
        if buffer:
            tokens.append(buffer)
            buffer = ""
        if translate_emojis:
            # Emoji without a dictionary entry are dropped here.
            if ch in emoji_dict:
                tokens.extend(emoji_dict[ch][3].lower().split())
        else:
            tokens.extend(ch)
    if buffer:
        tokens.append(buffer)
    return ' '.join(tokens) if tokens else word


# TODO: Numerals - sarcasm heavily relies on them so find a way to extract meaning behind numbers
# Attempt to clean each tweet and make it as grammatical as possible 
Example #11
Source File: data_processing.py    From Sarcasm-Detection with MIT License 5 votes vote down vote up
def extract_emojis(tweets):
    """For each tokenized tweet, collect its emoji characters as one
    space-joined string; returns one string per tweet."""
    per_tweet = []
    for tokens in tweets:
        found = [ch for tok in tokens for ch in tok
                 if ch in emoji.UNICODE_EMOJI]
        per_tweet.append(' '.join(found))
    return per_tweet


# Replace a contraction (coming from possessives, verbs, emphasis or just bad language) by its longer form 
Example #12
Source File: extract_ml_features.py    From Sarcasm-Detection with MIT License 5 votes vote down vote up
def get_pragmatic_features(tweet_tokens):
    """Compute count-based pragmatic features over a list of tokens.

    Returns a dict with character/token lengths, average token length,
    capitalized-word count, user-specific counts (mentions, hashtags,
    laughter, punctuation, emojis) and intensifier counts.
    """
    n_caps = 0
    n_user = 0
    n_intense = 0
    n_chars = 0
    for token in tweet_tokens:
        n_chars += len(token)
        lowered = token.lower()
        if token.isupper() and len(token) > 1:
            n_caps += 1                 # fully-capitalized words
        if token.startswith("@") or token.startswith("#"):
            n_user += 1                 # user mentions / hashtags (excluding sarcasm tags)
        if lowered.startswith("haha") or re.match('l(o)+l$', lowered):
            n_user += 1                 # presence of laughter
        if token in helper.strong_negations:
            n_intense += 1              # strong negations
        if token in helper.strong_affirmatives:
            n_intense += 1              # strong affirmatives
        if token in helper.interjections:
            n_intense += 1              # relevant interjections
        if token in helper.intensifiers:
            n_intense += 1              # relevant intensifiers
        if token in helper.punctuation:
            n_user += 1                 # relevant punctuation signs
        if token in emoji.UNICODE_EMOJI:
            n_user += 1                 # emojis
    n_tokens = len(tweet_tokens)
    # NOTE(review): despite the name this is tokens-per-character, as in
    # the original implementation.
    avg_len = float(n_tokens) / max(1.0, float(n_chars))
    return {'tw_len_ch': n_chars, 'tw_len_tok': n_tokens, 'avg_len': avg_len,
            'capitalized': n_caps, 'user_specific': n_user, 'intensifiers': n_intense}


# Extract the n-grams (specified as a list n = [1, 2, 3, ...])
# e.g if n = [1,2,3] then n-gram_features is a dictionary of all uni-grams, bi-grams and tri-grams
# This n-gram extractor works for any kind of tokens i.e both words and pos tags 
Example #13
Source File: utils.py    From fontObfuscator with MIT License 5 votes vote down vote up
def str_has_emoji(s: str) -> bool:
    """Report whether *s* contains at least one emoji character."""
    return any(ch in emoji.UNICODE_EMOJI for ch in s)
Example #14
Source File: candidate_data_fetcher.py    From BLINK with MIT License 5 votes vote down vote up
def get_data_for_entity(self, entity_data):
        """Given an entity data dictionary that contains some linking data
        (e.g. title or ID), add extra information (description, aliases,
        sentences) to it and return it."""
        record = self.data[entity_data["wikipedia_title"]]

        # Aliases: keep only non-emoji aliases; None when unavailable.
        aliases = None
        if "wikidata_info" in record:
            info = record["wikidata_info"]
            if "aliases" in info and info["aliases"] is not None:
                aliases = [
                    alias
                    for alias in info["aliases"]
                    if alias not in emoji.UNICODE_EMOJI
                ]
        entity_data["aliases"] = aliases

        # First ten description sentences; missing keys become "".
        entity_data["sentences"] = [
            record.get("sent_desc_{}".format(k), "") for k in range(1, 11)
        ]

        return entity_data
Example #15
Source File: structure_tools.py    From message-analyser with MIT License 5 votes vote down vote up
def get_emoji_countered(msgs):
    """Counts all emojis in messages.

    Args:
        msgs (list of MyMessage objects): Messages.

    Returns:
        collections.Counter of emojis.
    """
    counts = Counter()
    for message in msgs:
        counts.update(ch for ch in message.text if ch in emoji.UNICODE_EMOJI)
    return counts
Example #16
Source File: bot.py    From modmail with GNU Affero General Public License v3.0 5 votes vote down vote up
async def convert_emoji(self, name: str) -> str:
        """Resolve *name* to either a unicode emoji (returned as-is) or a
        custom guild Emoji via discord.py's EmojiConverter.

        Fixes vs. the scraped snippet: restored the ``async`` keyword
        (the body uses ``await``) and the missing first argument of the
        warning call — the format string has two ``%s`` placeholders but
        only ``e`` was passed, which makes logging raise a formatting
        error instead of emitting the message.

        Raises commands.BadArgument when the name cannot be converted.
        """
        ctx = SimpleNamespace(bot=self, guild=self.modmail_guild)
        converter = commands.EmojiConverter()

        if name not in UNICODE_EMOJI:
            try:
                name = await converter.convert(ctx, name.strip(":"))
            except commands.BadArgument as e:
                logger.warning("%s is not a valid emoji. %s.", name, e)
                raise
        return name
Example #17
Source File: filter_input.py    From DeepMoji with MIT License 5 votes vote down vote up
def read_english(path="english_words.txt", add_emojis=True):
    """Return the English filtering vocabulary as a set.

    Reads one lower-cased word per line (blank lines skipped); when
    *add_emojis* is true, all known emoji are counted as English too.
    """
    with codecs.open(path, "r", "utf-8") as fh:
        english = {line.strip().lower().replace('\n', '') for line in fh}
    # The comprehension may have collected the empty string from blank lines.
    english.discard('')
    if add_emojis:
        english.update(UNICODE_EMOJI)
    return english
Example #18
Source File: filter_utils.py    From DeepMoji with MIT License 5 votes vote down vote up
def separate_emojis_and_text(text):
    """Split the characters of *text* into (emoji, other), preserving order."""
    buckets = ([], [])
    for ch in text:
        # Index 1 collects emoji (True == 1), index 0 everything else.
        buckets[ch in emoji.UNICODE_EMOJI].append(ch)
    return ''.join(buckets[1]), ''.join(buckets[0])
Example #19
Source File: get_tweet.py    From Dialog with MIT License 5 votes vote down vote up
def screening(text):
    """Clean a raw tweet for use as Japanese dialog-corpus text.

    Strips the retweet marker, @-mentions, newlines, URLs, non-BMP
    characters and emoji; collapses repeated '。'; truncates at the
    first '#'; normalizes with neologdn; finally removes anything
    outside a Japanese/alphanumeric whitelist.
    """
    s = text

    # NOTE(review): replace() removes EVERY "RT " occurrence, not just the
    # leading one — presumably acceptable for tweets; verify if reused.
    if s[0:3] == "RT ":
        s = s.replace(s[0:3], "")
    # Remove each @mention up to the following space (or end of string).
    while s.find("@") != -1:
        index_at = s.find("@")
        if s.find(" ") != -1:
            index_sp = s.find(" ", index_at)
            if index_sp != -1:
                s = s.replace(s[index_at:index_sp + 1], "")
            else:
                s = s.replace(s[index_at:], "")
        else:
            s = s.replace(s[index_at:], "")

    # s[index_ret] is a single "\n", so replace() removes all newlines at
    # once — this loop runs at most one full iteration.
    while s.find("\n") != -1:
        index_ret = s.find("\n")
        s = s.replace(s[index_ret], "")
    s = s.replace('\n', '')

    # Strip URLs.
    s = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", s)
    # Map every non-BMP character (which includes most emoji) to "".
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), '')
    s = s.translate(non_bmp_map)
    # Remove any remaining (BMP) emoji characters.
    s = ''.join(c if c not in emoji.UNICODE_EMOJI else '' for c in s)

    # Collapse runs of the Japanese full stop.
    s = re.sub('。+', '。', s)

    # Truncate at the first hashtag (loop exits after one cut).
    while s.find('#') != -1:
        index_hash = s.find('#')
        s = s[0:index_hash]

    # Normalize Japanese text; repeat=4 presumably caps character
    # repetitions at 4 — confirm against neologdn docs.
    s = neologdn.normalize(s, repeat=4)

    # Whitelist: Japanese punctuation, kana, kanji ranges, digits, ASCII letters.
    s = re.sub(r'[^、。!?ー〜1-9a-zA-Zぁ-んァ-ヶ亜-腕纊-黑一-鿕]', '', s)

    return s
Example #20
Source File: data_ingestion.py    From BLINK with MIT License 4 votes vote down vote up
def get_data_for_key(data, title):
    """Build a flat ingestion record for one Wikipedia *title*.

    Pulls the Wikipedia/Wikidata IDs, description, non-emoji aliases and
    link statistics out of *data*; optionally copies the first ten
    description sentences when the module-level ``args`` requests it.
    """
    record = data[title]
    obj = {}

    obj["id"] = record["wikipedia_id"]
    obj["title"] = title

    # Prefer the Wikidata-provided ID; fall back to the index-derived one.
    has_wikidata_id = (
        "wikidata_info" in record
        and record["wikidata_info"]["wikidata_id"] is not None
    )
    if has_wikidata_id:
        obj["wikidata_id"] = record["wikidata_info"]["wikidata_id"]
    else:
        obj["wikidata_id"] = record["wikidata_id_from_index"]

    obj["desc"] = record["intro_concatenated"]

    wikidata_description = ""
    aliases = ""
    if "wikidata_info" in record:
        info = record["wikidata_info"]
        if "description" in info:
            wikidata_description = info["description"]
        if "aliases" in info and info["aliases"] is not None:
            # Quote each alias; skip aliases that are bare emoji.
            aliases = " ".join(
                '"{}"'.format(alias)
                for alias in info["aliases"]
                if alias not in emoji.UNICODE_EMOJI
            )

    obj["aliases"] = aliases
    obj["wikidata_desc"] = wikidata_description
    obj["num_tokens"] = record["num_tokens"]
    obj["num_incoming_links"] = record.get("num_incoming_links", 0)

    # NOTE(review): relies on a module-level ``args`` namespace.
    if args.add_sentence_data:
        for k in range(1, 11):
            key = "sent_desc_{}".format(k)
            obj[key] = record.get(key, "")

    return obj