Python nltk.tokenize.TweetTokenizer() Examples
The following are 19 code examples of nltk.tokenize.TweetTokenizer(), drawn from open-source projects; each example lists its source file, the project it comes from, and that project's license. You may also want to check out the other available functions and classes of the nltk.tokenize module.
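Before the project-specific examples, here is a minimal stand-alone sketch (not taken from any of the projects below) showing what TweetTokenizer itself does. The sample tweet is invented, and the printed output is what the options are expected to produce, roughly: strip_handles drops @mentions and reduce_len collapses long character runs to three repetitions.

from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
print(tknzr.tokenize('@remy: This is waaaaayyyy too much for you!!!!!!'))
# expected to yield something like:
# [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']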
Example #1
Source File: twitter.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def __init__(self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding='utf8'):
    """
    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    for path in self.abspaths(self._fileids):
        if isinstance(path, ZipFilePathPointer):
            pass
        elif os.path.getsize(path) == 0:
            raise ValueError("File {} is empty".format(path))
    """Check that all user-created corpus files are non-empty."""

    self._word_tokenizer = word_tokenizer
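For context, a hedged usage sketch of how a corpus reader like this might be instantiated (it applies equally to Example #3 below). The directory path is hypothetical, and the tokenized() call assumes the API of NLTK's TwitterCorpusReader, which this constructor comes from.

from nltk.corpus.reader import TwitterCorpusReader
from nltk.tokenize import TweetTokenizer

# Hypothetical directory containing line-delimited JSON tweet files.
root = '/path/to/tweet/corpus'
reader = TwitterCorpusReader(root, r'.*\.json',
                             word_tokenizer=TweetTokenizer(preserve_case=False))
for tokens in reader.tokenized()[:5]:
    print(tokens)  # each entry is the token list for one tweet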
Example #2
Source File: guesswhat_tokenizer.py From guesswhat with Apache License 2.0
def __init__(self, dictionary_file):
    with open(dictionary_file, 'r') as f:
        self.word2i = json.load(f)['word2i']
    self.wpt = TweetTokenizer(preserve_case=False)

    if "<stop_dialogue>" not in self.word2i:
        self.word2i["<stop_dialogue>"] = len(self.word2i)

    self.i2word = {}
    for (k, v) in self.word2i.items():
        self.i2word[v] = k

    # Retrieve key values
    self.no_words = len(self.word2i)
    self.start_token = self.word2i["<start>"]
    self.stop_token = self.word2i["?"]
    self.stop_dialogue = self.word2i["<stop_dialogue>"]
    self.padding_token = self.word2i["<padding>"]
    self.yes_token = self.word2i["<yes>"]
    self.no_token = self.word2i["<no>"]
    self.non_applicable_token = self.word2i["<n/a>"]

    self.answers = [self.yes_token, self.no_token, self.non_applicable_token]
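The snippet assumes dictionary_file is a JSON file whose "word2i" entry maps each special token and vocabulary word to an integer id. A hypothetical minimal file consistent with the lookups above (the real GuessWhat?! vocabulary is much larger) might look like:

{
    "word2i": {
        "<padding>": 0, "<start>": 1, "?": 2,
        "<yes>": 3, "<no>": 4, "<n/a>": 5,
        "is": 6, "it": 7
    }
}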
Example #3
Source File: twitter.py From razzy-spinner with GNU General Public License v3.0
def __init__(self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding='utf8'):
    """
    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    for path in self.abspaths(self._fileids):
        if isinstance(path, ZipFilePathPointer):
            pass
        elif os.path.getsize(path) == 0:
            raise ValueError("File {} is empty".format(path))
    """Check that all user-created corpus files are non-empty."""

    self._word_tokenizer = word_tokenizer
Example #4
Source File: extract_baseline_features.py From Sarcasm-Detection with MIT License
def get_ngram_features_from_map(tweets, ngram_map, n):
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    features = []
    for tweet in tweets:
        feature_list = [0] * np.zeros(len(ngram_map))
        tweet = tweet.lower()
        ngram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        if n > 1:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 2)
        if n > 2:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 3)
        for gram in ngram_list:
            if gram in ngram_map:
                feature_list[ngram_map[gram]] += 1.0
        features.append(feature_list)
    return features
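get_ngram_list is defined elsewhere in the project and is not shown in this example or in Example #5 below. A hypothetical sketch of such a helper, assuming it tokenizes the tweet and joins each window of n consecutive tokens into a space-separated n-gram, could look like:

def get_ngram_list(tokenizer, text, n):
    # Hypothetical helper: tokenize, then join each window of n consecutive tokens.
    tokens = tokenizer.tokenize(text)
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]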
Example #5
Source File: extract_baseline_features.py From Sarcasm-Detection with MIT License
def get_ngrams(tweets, n):
    unigrams = Counter()
    bigrams = Counter()
    trigrams = Counter()
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    for tweet in tweets:
        tweet = tweet.lower()
        # Get the unigram list for this tweet and update the unigram counter
        unigram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        unigrams.update(unigram_list)
        # Get the bigram list for this tweet and update the bigram counter
        if n > 1:
            bigram_list = get_ngram_list(regexp_tknzr, tweet, 2)
            bigrams.update(bigram_list)
        # Get the trigram list for this tweet and update the trigram counter
        if n > 2:
            trigram_list = get_ngram_list(regexp_tknzr, tweet, 3)
            trigrams.update(trigram_list)
    # Keep only the n-grams that appear at least min_occurence times
    min_occurence = 2
    unigram_tokens = [k for k, c in unigrams.items() if c >= min_occurence]
    # When using just unigrams, leave the bigrams and trigrams empty
    bigram_tokens = trigram_tokens = []
    if n > 1:
        bigram_tokens = [k for k, c in bigrams.items() if c >= min_occurence]
    if n > 2:
        trigram_tokens = [k for k, c in trigrams.items() if c >= min_occurence]
    return unigram_tokens, bigram_tokens, trigram_tokens
Example #6
Source File: model_vnc.py From Deep-Reinforcement-Learning-Hands-On with MIT License
def __init__(self, max_dict_size=MM_MAX_DICT_SIZE, device="cpu"):
    self.max_dict_size = max_dict_size
    self.token_to_id = {TOKEN_UNK: 0}
    self.next_id = 1
    self.tokenizer = TweetTokenizer(preserve_case=True)
    self.device = device
Example #7
Source File: utils.py From Deep-Reinforcement-Learning-Hands-On with MIT License
def tokenize(s):
    return TweetTokenizer(preserve_case=False).tokenize(s)
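Constructing a TweetTokenizer on every call works, but it rebuilds the tokenizer each time the function runs. A common alternative (not part of the original file) is to reuse a single module-level instance:

from nltk.tokenize import TweetTokenizer

_TOKENIZER = TweetTokenizer(preserve_case=False)

def tokenize(s):
    # Same behaviour as the snippet above, but the tokenizer is built only once.
    return _TOKENIZER.tokenize(s)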
Example #8
Source File: clean_raw.py From leaf with BSD 2-Clause "Simplified" License
def main():
    tknzr = TweetTokenizer()

    if not os.path.exists(FINAL_DIR):
        os.makedirs(FINAL_DIR)

    files = [f for f in os.listdir(DIR) if f.endswith('.pck')]
    files.sort()
    num_files = len(files)

    for i, f in enumerate(files):
        clean_file(f, tknzr)
        print('Done with {} of {}'.format(i, num_files))
Example #9
Source File: reddit_utils.py From leaf with BSD 2-Clause "Simplified" License
def clean_body(self, tknzr=None):
    if tknzr is None:
        tknzr = TweetTokenizer()

    # unescape html symbols.
    new_body = html.unescape(self.body)

    # remove extraneous whitespace.
    new_body = new_body.replace('\n', ' ')
    new_body = new_body.replace('\t', ' ')
    new_body = re.sub(r'\s+', ' ', new_body).strip()

    # remove non-ascii symbols.
    new_body = new_body.encode('ascii', errors='ignore').decode()

    # replace URLS with a special token.
    new_body = re.sub(URL_REGEX, URL_TOKEN, new_body)
    # replace reddit user with a token
    new_body = re.sub(USER_REGEX, USER_TOKEN, new_body)
    # replace subreddit names with a token
    new_body = re.sub(SUBREDDIT_REGEX, SUBREDDIT_TOKEN, new_body)

    # lowercase the text
    new_body = new_body.casefold()

    # Could be done in addition:
    # get rid of comments with quotes

    # tokenize the text
    new_body = tknzr.tokenize(new_body)

    self.body = ' '.join(new_body)
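URL_REGEX, USER_REGEX, SUBREDDIT_REGEX and the corresponding *_TOKEN constants are defined elsewhere in reddit_utils.py and are not shown here. The definitions below are plausible stand-ins for illustration only, not the project's actual patterns:

import re

# Hypothetical replacement tokens.
URL_TOKEN = '<url>'
USER_TOKEN = '<user>'
SUBREDDIT_TOKEN = '<subreddit>'

# Hypothetical patterns: bare URLs, reddit usernames, and subreddit references.
URL_REGEX = re.compile(r'https?://\S+')
USER_REGEX = re.compile(r'/?u/[A-Za-z0-9_-]+')
SUBREDDIT_REGEX = re.compile(r'/?r/[A-Za-z0-9_]+')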
Example #10
Source File: TrueCaser.py From truecase with Apache License 2.0
def __init__(self, dist_file_path=None):
    """ Initialize module with default data/english.dist file """
    if dist_file_path is None:
        dist_file_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "data/english.dist")

    with open(dist_file_path, "rb") as distributions_file:
        pickle_dict = pickle.load(distributions_file)

    self.uni_dist = pickle_dict["uni_dist"]
    self.backward_bi_dist = pickle_dict["backward_bi_dist"]
    self.forward_bi_dist = pickle_dict["forward_bi_dist"]
    self.trigram_dist = pickle_dict["trigram_dist"]
    self.word_casing_lookup = pickle_dict["word_casing_lookup"]

    self.tknzr = TweetTokenizer()
Example #11
Source File: util.py From topic-ensemble with Apache License 2.0
def preprocess_tweets(docs, stopwords, min_df=3, min_term_length=2, ngram_range=(1, 1), apply_tfidf=True, apply_norm=True):
    """
    Preprocess a list containing text documents stored as strings, where the documents
    have already been tokenized and are separated by whitespace
    """
    from nltk.tokenize import TweetTokenizer
    tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    def custom_tokenizer(s):
        # need to manually replace quotes
        s = s.replace("'", " ").replace('"', ' ')
        tokens = []
        for x in tweet_tokenizer.tokenize(s):
            if len(x) >= min_term_length:
                if x[0] == "#" or x[0].isalpha():
                    tokens.append(x)
        return tokens

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode",
                            tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function,
                            min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)

    # store the vocabulary map
    terms = []
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms)

# --------------------------------------------------------------
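A hedged usage sketch of this function follows; the documents and stopword list are invented, and it assumes the module-level scikit-learn import (TfidfVectorizer) that util.py relies on is in place. The function returns the sparse TF-IDF matrix and the vocabulary in term-index order:

docs = [
    "loving the new release #nlp",
    "the new release is awful",
    "loving it , loving it",
]
X, terms = preprocess_tweets(docs, stopwords=["the", "is", "it"], min_df=1)
print(X.shape)  # (number of documents, number of retained terms)
print(terms)    # e.g. ['#nlp', 'awful', 'loving', 'new', 'release'] -- order depends on the fitted vocabulary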
Example #12
Source File: extract_baseline_features.py From Sarcasm-Detection with MIT License
def get_features2(tweets, subj_dict):
    print("Getting features type 2...")
    features = []
    tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
    lemmatizer = WordNetLemmatizer()
    for tweet in tweets:
        feature_list = [0.0] * 5
        tokens = tknzr.tokenize(tweet)
        # Take the number of positive and negative words as features
        for word in tokens:
            stemmed = lemmatizer.lemmatize(word, 'v')
            stemmed = lemmatizer.lemmatize(stemmed)
            if stemmed in subj_dict:
                dictlist = []
                for word in subj_dict[stemmed]:
                    dictlist.extend(subj_dict[stemmed][word])
                if 'strongsubj' in dictlist:
                    value = 1.0
                else:
                    value = 0.5
                if 'positive' in dictlist:
                    feature_list[0] += value
                elif 'negative' in dictlist:
                    feature_list[1] += value
        # Take the ratio of positives to negatives as a feature
        if feature_list[0] != 0.0 and feature_list[1] != 0.0:
            feature_list[2] = feature_list[0] / feature_list[1]
        # Derive features from punctuation
        feature_list[2] += count_apparitions(tokens, helper.punctuation)
        # Take strong negations as a feature
        feature_list[3] += count_apparitions(tokens, helper.strong_negations)
        # Take strong affirmatives as a feature
        feature_list[4] += count_apparitions(tokens, helper.strong_affirmatives)
        features.append(feature_list)
    print("Done.")
    return features
Example #13
Source File: fetch_realtime_grounding.py From converse_reading_cmr with MIT License
def __init__(self, max_fact_len=12, max_facts_count=500, min_fact_len=8):
    self.tokenizer = TweetTokenizer(preserve_case=False)
    self.extractor = pke.unsupervised.TopicRank()
    self.max_fact_len = max_fact_len
    self.max_facts_count = max_facts_count
    self.min_fact_len = min_fact_len
Example #14
Source File: tokenizers.py From converse_reading_cmr with MIT License
def clean_str(txt):
    # print("in=[%s]" % txt)
    txt = txt.lower()
    txt = re.sub('^', ' ', txt)
    txt = re.sub('$', ' ', txt)

    # url and tag
    words = []
    for word in txt.split():
        i = word.find('http')
        if i >= 0:
            word = word[:i] + ' ' + '__url__'
        words.append(word.strip())
    txt = ' '.join(words)

    # remove markdown URL
    txt = re.sub(r'\[([^\]]*)\] \( *__url__ *\)', r'\1', txt)

    # remove illegal char
    txt = re.sub('__url__', 'URL', txt)
    txt = re.sub(r"[^A-Za-z0-9():,.!?\"\']", " ", txt)
    txt = re.sub('URL', '__url__', txt)

    # contraction
    add_space = ["'s", "'m", "'re", "n't", "'ll", "'ve", "'d", "'em"]
    tokenizer = TweetTokenizer(preserve_case=False)
    txt = ' ' + ' '.join(tokenizer.tokenize(txt)) + ' '
    txt = txt.replace(" won't ", " will n't ")
    txt = txt.replace(" can't ", " can n't ")
    for a in add_space:
        txt = txt.replace(a + ' ', ' ' + a + ' ')

    txt = re.sub(r'^\s+', '', txt)
    txt = re.sub(r'\s+$', '', txt)
    txt = re.sub(r'\s+', ' ', txt)  # remove extra spaces
    # print("out=[%s]" % txt)
    return txt
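For example (an invented input string; the exact tokenization can vary with the NLTK version), the contraction handling is expected to split negations into their own tokens:

print(clean_str("I can't wait!"))
# expected to yield something like: "i can n't wait !"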
Example #15
Source File: parse_utils.py From deep-mlsa with Apache License 2.0
def __init__(self):
    self.tokenizers = {
        'en': TweetTokenizer(),
        'de': WordPunctTokenizer(),
        'it': WordPunctTokenizer(),
        'fr': WordPunctTokenizer(),
        'default': WordPunctTokenizer()
    }
    self.tokenizer = TweetTokenizer()
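Presumably the per-language map is used to pick a tokenizer at runtime; a hypothetical dispatch method (not part of the original class) might look like:

def tokenize(self, text, language='default'):
    # Fall back to the default WordPunctTokenizer for unknown language codes.
    tokenizer = self.tokenizers.get(language, self.tokenizers['default'])
    return tokenizer.tokenize(text)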
Example #16
Source File: vqa_tokenizer.py From Conditional-Batch-Norm with MIT License
def __init__(self, dictionary_file):
    self.tokenizer = TweetTokenizer(preserve_case=False)

    with open(dictionary_file, 'r') as f:
        data = json.load(f)
        self.word2i = data['word2i']
        self.answer2i = data['answer2i']
        self.preprocess_answers = data['preprocess_answers']

    self.dictionary_file = dictionary_file

    self.i2word = {}
    for (k, v) in self.word2i.items():
        self.i2word[v] = k

    self.i2answer = {}
    for (k, v) in self.answer2i.items():
        self.i2answer[v] = k

    # Retrieve key values
    self.no_words = len(self.word2i)
    self.no_answers = len(self.answer2i)

    self.unknown_question_token = self.word2i["<unk>"]
    self.padding_token = self.word2i["<unk>"]
    self.unknown_answer = self.answer2i["<unk>"]
Example #17
Source File: tokenizers.py From DialoGPT with MIT License
def clean_str(txt):
    # print("in=[%s]" % txt)
    txt = txt.lower()
    txt = re.sub('^', ' ', txt)
    txt = re.sub('$', ' ', txt)

    # url and tag
    words = []
    for word in txt.split():
        i = word.find('http')
        if i >= 0:
            word = word[:i] + ' ' + '__url__'
        words.append(word.strip())
    txt = ' '.join(words)

    # remove markdown URL
    txt = re.sub(r'\[([^\]]*)\] \( *__url__ *\)', r'\1', txt)

    # remove illegal char
    txt = re.sub('__url__', 'URL', txt)
    txt = re.sub(r"[^A-Za-z0-9():,.!?\"\']", " ", txt)
    txt = re.sub('URL', '__url__', txt)

    # contraction
    add_space = ["'s", "'m", "'re", "n't", "'ll", "'ve", "'d", "'em"]
    tokenizer = TweetTokenizer(preserve_case=False)
    txt = ' ' + ' '.join(tokenizer.tokenize(txt)) + ' '
    txt = txt.replace(" won't ", " will n't ")
    txt = txt.replace(" can't ", " can n't ")
    for a in add_space:
        txt = txt.replace(a + ' ', ' ' + a + ' ')

    txt = re.sub(r'^\s+', '', txt)
    txt = re.sub(r'\s+$', '', txt)
    txt = re.sub(r'\s+', ' ', txt)  # remove extra spaces
    # print("out=[%s]" % txt)
    return txt
Example #18
Source File: reddit.py From DialoGPT with MIT License
def gpt_norm_sentence(txt):
    # url and tag
    words = []
    for word in txt.split():
        if word[0] == '#':  # don't allow tag
            continue
        i = word.lower().find('http')
        if i >= 0:
            word = word[:i] + ' ' + '__url__'
        words.append(word.strip())
    txt = ' '.join(words)

    # remove illegal char
    txt = txt.replace(chr(92), '')  # chr(92) = '\'. as twitter has 'b\/c' rather than 'b/c'
    txt = txt.replace("b/c", "because").replace('j/k', 'just kidding').replace('w/o', 'without').replace('w/', 'with')
    txt = re.sub('__mention__', 'MENTION', txt)
    txt = re.sub('__url__', 'URL', txt)
    txt = re.sub(r"[^A-Za-z0-9()\[\]:,.!?'“” ]", " ", txt)
    txt = re.sub('MENTION', '__mention__', txt)
    txt = re.sub('URL', '__url__', txt)

    tokenizer = TweetTokenizer(preserve_case=True)
    txt = ' ' + ' '.join(tokenizer.tokenize(txt)) + ' '

    # remove un-necessary space
    return ' '.join(txt.split())
Example #19
Source File: extract_baseline_features.py From Sarcasm-Detection with MIT License
def get_features1(tweets, subj_dict):
    print("Getting features type 1...")
    features = []
    tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    lemmatizer = WordNetLemmatizer()
    # Take positive and negative noun/verb phrases as features
    for tweet in tweets:
        feature_list = [0.0] * 6
        tokens = tknzr.tokenize(tweet)
        pos = pos_tag(tokens)
        pos = [p for p in pos if 'VB' in p[1] or 'NN' in p[1]]
        for p in pos:
            stemmed = lemmatizer.lemmatize(p[0], 'v')
            stemmed = lemmatizer.lemmatize(stemmed)
            if 'VB' in p[1] and stemmed in subj_dict:
                if 'verb' in subj_dict[stemmed]:
                    if 'positive' in subj_dict[stemmed]['verb']:
                        feature_list[0] += 1.0
                    if 'negative' in subj_dict[stemmed]['verb']:
                        feature_list[1] += 1.0
                elif 'anypos' in subj_dict[stemmed]:
                    if 'positive' in subj_dict[stemmed]['anypos']:
                        feature_list[0] += 1.0
                    if 'negative' in subj_dict[stemmed]['anypos']:
                        feature_list[1] += 1.0
            if 'NN' in p[1] and stemmed in subj_dict:
                if 'noun' in subj_dict[stemmed]:
                    if 'positive' in subj_dict[stemmed]['noun']:
                        feature_list[2] += 1.0
                    if 'negative' in subj_dict[stemmed]['noun']:
                        feature_list[3] += 1.0
                elif 'anypos' in subj_dict[stemmed]:
                    if 'positive' in subj_dict[stemmed]['anypos']:
                        feature_list[2] += 1.0
                    if 'negative' in subj_dict[stemmed]['anypos']:
                        feature_list[3] += 1.0
        # Derive features from punctuation
        feature_list[4] += count_apparitions(tokens, helper.punctuation)
        # Take the number of strong negations as a feature
        feature_list[5] += count_apparitions(tokens, helper.strong_negations)
        features.append(feature_list)
    print("Done.")
    return features
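Both get_features1 and get_features2 (Example #12) assume a subjectivity lexicon of nested dictionaries, keyed by lemma and then by part of speech, with label lists containing entries such as 'positive', 'negative' and 'strongsubj'. A hypothetical fragment consistent with the lookups above, invented for illustration:

subj_dict = {
    # lemma -> part of speech ('verb', 'noun', 'anypos', ...) -> list of labels
    'love': {'verb': ['positive', 'strongsubj']},
    'hate': {'verb': ['negative', 'strongsubj'], 'noun': ['negative', 'strongsubj']},
    'fine': {'anypos': ['positive', 'weaksubj']},
}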