Python nltk.tokenize.RegexpTokenizer() Examples
The following are 30 code examples of nltk.tokenize.RegexpTokenizer(). Each example links to the original project and source file it was taken from. You may also want to check out the other functions and classes available in the nltk.tokenize module.
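Before the project-specific examples, here is a minimal, self-contained sketch of the usage pattern most of them share. The pattern and the sample sentence are illustrative only and are not taken from any of the projects below.

from nltk.tokenize import RegexpTokenizer

# Match runs of word characters; punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')
print(tokenizer.tokenize("Hello, world! It's 2024."))
# ['Hello', 'world', 'It', 's', '2024']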
Example #1
Source File: preprocessing.py From toxic_comments with MIT License | 7 votes |
def clean_text(df, wrong_words_dict, autocorrect=True):
    df.fillna("__NA__", inplace=True)
    tokinizer = RegexpTokenizer(r'\w+')
    regexps = [re.compile("([a-zA-Z]+)([0-9]+)"), re.compile("([0-9]+)([a-zA-Z]+)")]
    texts = df.tolist()
    result = []
    for text in tqdm(texts):
        tokens = tokinizer.tokenize(text.lower())
        tokens = [split_text_and_digits(token, regexps) for token in tokens]
        tokens = [substitute_repeats(token, 3) for token in tokens]
        text = ' '.join(tokens)
        if autocorrect:
            for wrong, right in wrong_words_dict.items():
                text = text.replace(wrong, right)
        result.append(text)
    return result
Example #2
Source File: aligned.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def __init__(self, root, fileids, sep='/',
             word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             alignedsent_block_reader=read_alignedsent_block,
             encoding='latin1'):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader
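This corpus reader (and several others below) passes gaps=True, which makes the pattern describe the separators between tokens rather than the tokens themselves. A minimal sketch of that behaviour, using an invented input string:

from nltk.tokenize import RegexpTokenizer

# With gaps=True the regexp matches token separators, so splitting on
# newlines yields one "sentence" per line.
sent_tokenizer = RegexpTokenizer('\n', gaps=True)
print(sent_tokenizer.tokenize("first line\nsecond line\nthird line"))
# ['first line', 'second line', 'third line']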
Example #3
Source File: aligned.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def __init__(
    self,
    root,
    fileids,
    sep='/',
    word_tokenizer=WhitespaceTokenizer(),
    sent_tokenizer=RegexpTokenizer('\n', gaps=True),
    alignedsent_block_reader=read_alignedsent_block,
    encoding='latin1',
):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader
Example #4
Source File: preprocessing.py From neupy with MIT License | 6 votes |
def transform(self, texts, y=None):
    tokenizer = RegexpTokenizer(r'[a-z]+|\d+')
    tokenized_texts = []
    stoplist = []

    if self.ignore_stopwords:
        stoplist = stopwords.words('english')

    for text in texts:
        tokenized_text = []
        for word in tokenizer.tokenize(text.lower()):
            if word not in stoplist:
                tokenized_text.append(word.strip())
        tokenized_texts.append(tokenized_text)

    return tokenized_texts
Example #5
Source File: topic_modeler.py From Artificial-Intelligence-with-Python with MIT License | 6 votes |
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer
    stemmer = SnowballStemmer('english')

    # Get the list of stop words
    stop_words = stopwords.words('english')

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words
    tokens = [x for x in tokens if not x in stop_words]

    # Perform stemming on the tokenized words
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed
Example #6
Source File: vocab_index_descriptions.py From caml-mimic with MIT License | 6 votes |
def vocab_index_descriptions(vocab_file, vectors_file):
    #load lookups
    vocab = set()
    with open(vocab_file, 'r') as vocabfile:
        for i,line in enumerate(vocabfile):
            line = line.strip()
            if line != '':
                vocab.add(line)
    ind2w = {i+1:w for i,w in enumerate(sorted(vocab))}
    w2ind = {w:i for i,w in ind2w.items()}

    desc_dict = datasets.load_code_descriptions()

    tokenizer = RegexpTokenizer(r'\w+')

    with open(vectors_file, 'w') as of:
        w = csv.writer(of, delimiter=' ')
        w.writerow(["CODE", "VECTOR"])
        for code, desc in tqdm(desc_dict.items()):
            #same preprocessing steps as in get_discharge_summaries
            tokens = [t.lower() for t in tokenizer.tokenize(desc) if not t.isnumeric()]
            inds = [w2ind[t] if t in w2ind.keys() else len(w2ind)+1 for t in tokens]
            w.writerow([code] + [str(i) for i in inds])
Example #7
Source File: extract_baseline_features.py From Sarcasm-Detection with MIT License | 6 votes |
def get_ngram_features_from_map(tweets, ngram_map, n):
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    features = []
    for tweet in tweets:
        feature_list = [0] * np.zeros(len(ngram_map))
        tweet = tweet.lower()
        ngram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        if n > 1:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 2)
        if n > 2:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 3)
        for gram in ngram_list:
            if gram in ngram_map:
                feature_list[ngram_map[gram]] += 1.0
        features.append(feature_list)
    return features
Example #8
Source File: aligned.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def __init__(self, root, fileids, sep='/',
             word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             alignedsent_block_reader=read_alignedsent_block,
             encoding=None):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt')

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader
Example #9
Source File: topic_modeling.py From Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition with MIT License | 5 votes |
def __init__(self):
    # Create a regular expression tokenizer
    self.tokenizer = RegexpTokenizer(r'\w+')

    # get the list of stop words
    self.english_stop_words = stopwords.words('english')

    # Create a Snowball stemmer
    self.snowball_stemmer = SnowballStemmer('english')

# Tokenizing, stop word removal, and stemming
Example #10
Source File: query_processing.py From DarkDarkGo with MIT License | 5 votes |
def clean_text(dirty_text):
    """
    Given a string, this function tokenizes the words of that string.

    :param dirty_text: string
    :return: list

    input = "American artist accomplishments american"
    output = ['accomplishments', 'american', 'artist']
    """
    lower_dirty_text = dirty_text.lower()
    regex_pattern = r"[\w']+"
    tokenizer = RegexpTokenizer(regex_pattern)
    tokens = tokenizer.tokenize(lower_dirty_text)
    unique_tokens = list(set(tokens))
    return unique_tokens
Example #11
Source File: data.py From tagan with Apache License 2.0 | 5 votes |
def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower())
Example #12
Source File: ycoe.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, root, items, encoding='utf8'):
    gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
    sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
    TaggedCorpusReader.__init__(self, root, items, sep='_',
                                sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Example #13
Source File: ycoe.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def __init__(self, root, items, encoding='utf8'):
    gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
    sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
    TaggedCorpusReader.__init__(
        self, root, items, sep='_', sent_tokenizer=sent_tokenizer
    )

#: A list of all documents and their titles in ycoe.
Example #14
Source File: rte_classify.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                          'have', 'are', 'were', 'and', 'very', '.', ','])

    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
    # Try to tokenize so that abbreviations like U.S.and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')

    #Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        self.text_words = set(lemmatize(token) for token in self.text_tokens)
        self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
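The comment in this example wants abbreviations and dollar amounts to survive as single tokens. A rough sketch of the same idea, written with a non-capturing group (?:...) so that the findall-based tokenization is well defined; the sample sentence is invented:

from nltk.tokenize import RegexpTokenizer

# Alternatives are tried left to right: "U.S." matches (?:[A-Z]\.)+,
# ordinary words match \w+, and "$23.00" matches \$[\d\.]+.
tokenizer = RegexpTokenizer(r'(?:[A-Z]\.)+|\w+|\$[\d\.]+')
print(tokenizer.tokenize('The U.S. spent $23.00 on this.'))
# ['The', 'U.S.', 'spent', '$23.00', 'on', 'this']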
Example #15
Source File: datasets.py From AttnGAN with MIT License | 5 votes |
def load_captions(self, data_dir, filenames):
    all_captions = []
    for i in range(len(filenames)):
        cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
        with open(cap_path, "r") as f:
            captions = f.read().decode('utf8').split('\n')
            cnt = 0
            for cap in captions:
                if len(cap) == 0:
                    continue
                cap = cap.replace("\ufffd\ufffd", " ")
                # picks out sequences of alphanumeric characters as tokens
                # and drops everything else
                tokenizer = RegexpTokenizer(r'\w+')
                tokens = tokenizer.tokenize(cap.lower())
                # print('tokens', tokens)
                if len(tokens) == 0:
                    print('cap', cap)
                    continue

                tokens_new = []
                for t in tokens:
                    t = t.encode('ascii', 'ignore').decode('ascii')
                    if len(t) > 0:
                        tokens_new.append(t)
                all_captions.append(tokens_new)
                cnt += 1
                if cnt == self.embeddings_num:
                    break
            if cnt < self.embeddings_num:
                print('ERROR: the captions for %s less than %d'
                      % (filenames[i], cnt))
    return all_captions
Example #16
Source File: helpers.py From chirp with MIT License | 5 votes |
def get_tokens(text):
    """Tokenize the input text."""
    soup = BeautifulSoup(text, "html.parser")
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(soup.get_text())
Example #17
Source File: lda_model_calculator.py From moviegeek with MIT License | 5 votes |
def build_lda_model(self, data, docs, n_topics=5):
    texts = []
    tokenizer = RegexpTokenizer(r'\w+')
    for d in tqdm(data):
        raw = d.lower()

        tokens = tokenizer.tokenize(raw)

        stopped_tokens = self.remove_stopwords(tokens)

        stemmed_tokens = stopped_tokens
        #stemmer = PorterStemmer()
        #stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]

        texts.append(stemmed_tokens)

    dictionary = corpora.Dictionary(texts)

    corpus = [dictionary.doc2bow(text) for text in texts]

    lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                         num_topics=n_topics)

    index = similarities.MatrixSimilarity(corpus)

    self.save_lda_model(lda_model, corpus, dictionary, index)
    self.save_similarities(index, docs)

    return dictionary, texts, lda_model
Example #18
Source File: lda_model_calculator.py From moviegeek with MIT License | 5 votes |
def tokenize(self, data):
    tokenizer = RegexpTokenizer(r'\w+')
    return [tokenizer.tokenize(d) for d in data]
Example #19
Source File: datasets.py From DM-GAN with MIT License | 5 votes |
def load_captions(self, data_dir, filenames):
    all_captions = []
    for i in range(len(filenames)):
        cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
        with open(cap_path, "r") as f:
            captions = f.read().decode('utf8').split('\n')
            cnt = 0
            for cap in captions:
                if len(cap) == 0:
                    continue
                cap = cap.replace("\ufffd\ufffd", " ")
                # picks out sequences of alphanumeric characters as tokens
                # and drops everything else
                tokenizer = RegexpTokenizer(r'\w+')
                tokens = tokenizer.tokenize(cap.lower())
                # print('tokens', tokens)
                if len(tokens) == 0:
                    print('cap', cap)
                    continue

                tokens_new = []
                for t in tokens:
                    t = t.encode('ascii', 'ignore').decode('ascii')
                    if len(t) > 0:
                        tokens_new.append(t)
                all_captions.append(tokens_new)
                cnt += 1
                if cnt == self.embeddings_num:
                    break
            if cnt < self.embeddings_num:
                print('ERROR: the captions for %s less than %d'
                      % (filenames[i], cnt))
    return all_captions
Example #20
Source File: extract_baseline_features.py From Sarcasm-Detection with MIT License | 5 votes |
def get_ngrams(tweets, n):
    unigrams = Counter()
    bigrams = Counter()
    trigrams = Counter()
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    for tweet in tweets:
        tweet = tweet.lower()
        # Get the unigram list for this tweet and update the unigram counter
        unigram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        unigrams.update(unigram_list)
        # Get the bigram list for this tweet and update the bigram counter
        if n > 1:
            bigram_list = get_ngram_list(regexp_tknzr, tweet, 2)
            bigrams.update(bigram_list)
        # Get the trigram list for this tweet and update the trigram counter
        if n > 2:
            trigram_list = get_ngram_list(regexp_tknzr, tweet, 3)
            trigrams.update(trigram_list)
    # Update the counters such that each n-gram appears at least min_occurence times
    min_occurence = 2
    unigram_tokens = [k for k, c in unigrams.items() if c >= min_occurence]
    # In case using just unigrams, make the bigrams and trigrams empty
    bigram_tokens = trigram_tokens = []
    if n > 1:
        bigram_tokens = [k for k, c in bigrams.items() if c >= min_occurence]
    if n > 2:
        trigram_tokens = [k for k, c in trigrams.items() if c >= min_occurence]
    return unigram_tokens, bigram_tokens, trigram_tokens
Example #21
Source File: data.py From dong_iccv_2017 with MIT License | 5 votes |
def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower())
Example #22
Source File: IdentifyingTopic.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def cleanDocuments(self):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    en_stop = set(stopwords.words('english'))
    self.cleaned = []
    for doc in self.documents:
        lowercase_doc = doc.lower()
        words = tokenizer.tokenize(lowercase_doc)
        non_stopped_words = [i for i in words if not i in en_stop]
        self.cleaned.append(non_stopped_words)
    print("INFO: Clearning {} documents completed".format(len(self.documents)))
Example #23
Source File: semantic_image_synthesis_dataset.py From DMIT with MIT License | 5 votes |
def load_captions(self, data_dir, filenames):
    all_captions = []
    for i in range(len(filenames)):
        cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
        with open(cap_path, "r") as f:
            captions = f.read().decode('utf8').split('\n')
            cnt = 0
            for cap in captions:
                if len(cap) == 0:
                    continue
                cap = cap.replace("\ufffd\ufffd", " ")
                # picks out sequences of alphanumeric characters as tokens
                # and drops everything else
                tokenizer = RegexpTokenizer(r'\w+')
                tokens = tokenizer.tokenize(cap.lower())
                # print('tokens', tokens)
                if len(tokens) == 0:
                    print('cap', cap)
                    continue

                tokens_new = []
                for t in tokens:
                    t = t.encode('ascii', 'ignore').decode('ascii')
                    if len(t) > 0:
                        tokens_new.append(t)
                all_captions.append(tokens_new)
                cnt += 1
                if cnt == self.embeddings_num:
                    break
            if cnt < self.embeddings_num:
                print('ERROR: the captions for %s less than %d'
                      % (filenames[i], cnt))
    return all_captions
Example #24
Source File: ycoe.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def __init__(self, root, items, encoding=None):
    gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
    sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
    TaggedCorpusReader.__init__(self, root, items, sep='_',
                                sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Example #25
Source File: Paste.py From AIL-framework with GNU Affero General Public License v3.0 | 5 votes |
def _get_top_words(self, sort=False):
    """
    Tokenising method: Returning a sorted list or a set of paste's words

    :param sort: Selecting the output: sorted list or a set. (set by default)

    :return: set or sorted list of tuple [(word, occurency)...]

    :Example: PST._get_top_words(False)
    """
    words = {}
    tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                                gaps=True, discard_empty=True)

    blob = TextBlob(clean( (self.get_p_content()) ), tokenizer=tokenizer)

    for word in blob.tokens:
        if word in words.keys():
            num = words[word]
        else:
            num = 0
        words[word] = num + 1
    if sort:
        var = sorted(words.items(), key=operator.itemgetter(1), reverse=True)
    else:
        var = words

    return var
Example #26
Source File: datasets.py From multiple-objects-gan with MIT License | 5 votes |
def load_captions(self, data_dir, filenames):
    all_captions = []
    for i in range(len(filenames)):
        cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
        with open(cap_path, "r") as f:
            captions = f.read().decode('utf8').split('\n')
            cnt = 0
            for cap in captions:
                if len(cap) == 0:
                    continue
                cap = cap.replace("\ufffd\ufffd", " ")
                # picks out sequences of alphanumeric characters as tokens
                # and drops everything else
                tokenizer = RegexpTokenizer(r'\w+')
                tokens = tokenizer.tokenize(cap.lower())
                # print('tokens', tokens)
                if len(tokens) == 0:
                    print('cap', cap)
                    continue

                tokens_new = []
                for t in tokens:
                    t = t.encode('ascii', 'ignore').decode('ascii')
                    if len(t) > 0:
                        tokens_new.append(t)
                all_captions.append(tokens_new)
                cnt += 1
                if cnt == self.embeddings_num:
                    break
            if cnt < self.embeddings_num:
                print('ERROR: the captions for %s less than %d'
                      % (filenames[i], cnt))
    return all_captions
Example #27
Source File: datasets.py From semantic-object-accuracy-for-generative-text-to-image-synthesis with MIT License | 5 votes |
def load_captions(self, data_dir, filenames):
    all_captions = []
    for i in range(len(filenames)):
        cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
        with open(cap_path, "r") as f:
            captions = f.read().decode('utf8').split('\n')
            cnt = 0
            for cap in captions:
                if len(cap) == 0:
                    continue
                cap = cap.replace("\ufffd\ufffd", " ")
                # picks out sequences of alphanumeric characters as tokens
                # and drops everything else
                tokenizer = RegexpTokenizer(r'\w+')
                tokens = tokenizer.tokenize(cap.lower())
                # print('tokens', tokens)
                if len(tokens) == 0:
                    print('cap', cap)
                    continue

                tokens_new = []
                for t in tokens:
                    t = t.encode('ascii', 'ignore').decode('ascii')
                    if len(t) > 0:
                        tokens_new.append(t)
                all_captions.append(tokens_new)
                cnt += 1
                if cnt == self.embeddings_num:
                    break
            if cnt < self.embeddings_num:
                print('ERROR: the captions for %s less than %d'
                      % (filenames[i], cnt))
    return all_captions
Example #28
Source File: rte_classify.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'have',
                          'is', 'are', 'were', 'and', 'very', '.', ','])

    self.negwords = set(['no', 'not', 'never', 'failed' 'rejected', 'denied'])
    # Try to tokenize so that abbreviations like U.S.and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')

    #Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        self.text_words = set([lemmatize(token) for token in self.text_tokens])
        self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
Example #29
Source File: datasets.py From attn-gan with MIT License | 5 votes |
def load_captions(self, data_dir, filenames):
    all_captions = []
    for i in range(len(filenames)):
        cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
        with open(cap_path, "r") as f:
            captions = f.read().decode('utf8').split('\n')
            cnt = 0
            for cap in captions:
                if len(cap) == 0:
                    continue
                cap = cap.replace("\ufffd\ufffd", " ")
                # picks out sequences of alphanumeric characters as tokens
                # and drops everything else
                tokenizer = RegexpTokenizer(r'\w+')
                tokens = tokenizer.tokenize(cap.lower())
                # print('tokens', tokens)
                if len(tokens) == 0:
                    print('cap', cap)
                    continue

                tokens_new = []
                for t in tokens:
                    t = t.encode('ascii', 'ignore').decode('ascii')
                    if len(t) > 0:
                        tokens_new.append(t)
                all_captions.append(tokens_new)
                cnt += 1
                if cnt == self.embeddings_num:
                    break
            if cnt < self.embeddings_num:
                print('ERROR: the captions for %s less than %d'
                      % (filenames[i], cnt))
    return all_captions
Example #30
Source File: topic_extractor.py From TBBTCorpus with Apache License 2.0 | 5 votes |
def __tokenize(self, docs):
    output = []
    for doc in docs:
        tokenizer = RegexpTokenizer(r'\w\w\w\w\w+')
        output.append(tokenizer.tokenize(doc.lower()))
    return output