Python nltk.tokenize() Examples
The following are 30 code examples of nltk.tokenize(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.
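Before working through the examples, here is a minimal sketch of the two most common entry points in the nltk.tokenize module, sent_tokenize() and word_tokenize(). It assumes the punkt tokenizer models have already been fetched with nltk.download; the sample sentence is only illustrative.

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# nltk.download('punkt')  # one-time download required by both tokenizers

text = "Good muffins cost $3.88 in New York. Please buy me two of them."
for sentence in sent_tokenize(text):    # split the text into sentences
    print(word_tokenize(sentence))      # split each sentence into word tokens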
Example #1
Source File: word.py From flambe with MIT License | 6 votes |
def tokenize(self, example: str) -> List[str]:
    """Tokenize an input example.

    Parameters
    ----------
    example : str
        The input example, as a string.

    Returns
    -------
    List[str]
        The output word tokens, as a list of strings

    """
    if self.exclude_stopwords and self.stop_words:
        example = ' '.join([word for word in word_tokenize(example)
                            if word not in self.stop_words])

    if isinstance(self.ngrams, List):
        ret: List[str] = []
        for i in self.ngrams:
            ret.extend(self._tokenize(example, i))
        return ret
    else:
        return NGramsTokenizer._tokenize(example, self.ngrams)
Example #2
Source File: reader.py From atap with Apache License 2.0 | 6 votes |
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        for sent in para:
            for word, tag in sent:
                counts['words'] += 1
                tokens[word] += 1

    # Return data structure with information
    return {
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
    }
Example #3
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp
    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))
Example #4
Source File: phrasemachine.py From phrasemachine with MIT License | 6 votes |
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

    # http://www.nltk.org/book/ch05.html
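As a side note, the trick above is easy to reuse outside phrasemachine: constructing TreebankWordTokenizer directly gives a purely regex-based Penn Treebank word tokenizer, so no NLTK downloads are needed for word tokenization (sentence splitting still relies on a punkt pickle, which this project ships in its package data). A minimal sketch, not taken from the original project:

from nltk.tokenize import TreebankWordTokenizer

tokenize = TreebankWordTokenizer().tokenize  # regex-based PTB tokenizer, no downloader involved
print(tokenize("They'll save and invest more."))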
Example #5
Source File: kaggle18.py From modin with Apache License 2.0 | 6 votes |
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try:
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        text = regex.sub(" ", text)  # remove punctuation
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search("[a-zA-Z]", w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3]
        return filtered_tokens
    except TypeError as e:
        print(text, e)
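The function above relies on several module-level names (re, string, stop, word_tokenize, sent_tokenize) defined elsewhere in kaggle18.py. A hedged sketch of the assumed setup, so the example can be run standalone:

import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Assumed definition of the module-level `stop` used above; requires the
# NLTK 'stopwords' corpus (and 'punkt' for the tokenizers) to be downloaded.
stop = set(stopwords.words('english'))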
Example #6
Source File: load_data.py From Dense-CoAttention-Network with MIT License | 6 votes |
def tokenize_mcb(sentence):
    """
    MCB tokenize implementation.
    --------------------
    Arguments:
        sentence (str): a sentence that will be tokenized.
    Return:
        A list of tokens from the sentence.
    """
    for i in [r"\?", r"\!", r"\'", r"\"", r"\$", r"\:", r"\@", r"\(", r"\)", r"\,", r"\.", r"\;"]:
        sentence = re.sub(i, "", sentence)

    for i in [r"\-", r"\/"]:
        sentence = re.sub(i, " ", sentence)

    q_list = re.sub(r"\?", "", sentence.lower()).split()
    q_list = list(filter(lambda x: len(x) > 0, q_list))

    return q_list
Example #7
Source File: train_LDA.py From sato with Apache License 2.0 | 6 votes |
def process_col(col, **kwargs):
    numeric = kwargs['num']
    # process the cols to return a bags of word representation
    if col.dtype == 'int64' or col.dtype == 'float64':
        if numeric == 'directstr':
            return list(col.astype(str))
        elif numeric == 'placeholder':
            return [str(col.dtype)] * len(col)

    if col.dtype == 'object':
        return tokenize(list(col.astype(str)), **kwargs)
    else:
        return list(col.astype(str))

    return col
Example #8
Source File: experiments.py From clickbait with MIT License | 6 votes |
def handle_multiple_sentences(infile, outfile):
    titles = []
    f = open(infile, "r")
    f2 = codecs.open(outfile, "w+", "utf-8")
    for line in f:
        line = line.decode("utf-8")
        sentences = sent_detector.tokenize(line.strip())
        for i in range(len(sentences)):
            if i == 0:
                sentences[i] = sentences[i].replace(sentences[i].split()[0], sentences[i].split()[0].title())
            else:
                sentences[i] = sentences[i].replace(sentences[i].split()[0], sentences[i].split()[0].title())
                sentences[i-1] = sentences[i-1].replace(sentences[i-1].split()[-1][-1], " ::::")
        titles.append(" ".join(sentences))
    title_set = set(titles)
    for l in title_set:
        print >> f2, l
Example #9
Source File: phrasemachine.py From scattertext with Apache License 2.0 | 6 votes |
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    #return pkgutil.get_data('scattertext',
    #                        'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

    # http://www.nltk.org/book/ch05.html
Example #10
Source File: skip_thoughts_encoder.py From object_detection_kitti with Apache License 2.0 | 5 votes |
def _tokenize(self, item):
    """Tokenizes an input string into a list of words."""
    tokenized = []
    for s in self._sentence_detector.tokenize(item):
        tokenized.extend(nltk.tokenize.word_tokenize(s))

    return tokenized
Example #11
Source File: phrasemachine.py From phrasemachine with MIT License | 5 votes |
def tag_text(self, text):
    '''take input text and return tokens w/ part of speech tags using NLTK'''
    # putting import here instead of top of file b.c. not all will have nltk installed
    sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars. I think assumes ascii
    word_pos_pairs = []
    all_tokens = []
    for sent in sents:
        tokens = self.tokenize(sent)
        all_tokens = all_tokens + tokens
        word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
    return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
Example #12
Source File: common.py From acl-anthology with Apache License 2.0 | 5 votes |
def tokenize(s):
    """Splits tokens (hyphens/slashes count as separate tokens)."""
    tokens = []
    # NLTK tokenizer uses PTB standard, which doesn't split on hyphens or slashes
    for tok in nltk.tokenize.word_tokenize(s):
        # tokenizer normalizes quotes etc., so we need to detokenize later
        tokens.extend([t for t in re.split(r"([-–/])", tok) if t != ""])
    return tokens
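Because the re.split() pattern above uses a capturing group, the hyphen/slash delimiters are returned as tokens of their own rather than discarded. A quick illustration, not part of the original file:

import re

print([t for t in re.split(r"([-–/])", "state-of-the-art") if t != ""])
# ['state', '-', 'of', '-', 'the', '-', 'art']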
Example #13
Source File: preprocessing.py From open-solution-toxic-comments with MIT License | 5 votes |
def _use_stopwords(self, x):
    words = tokenizer.tokenize(x)
    words = [w for w in words if not w in eng_stopwords]
    x = " ".join(words)
    return x
Example #14
Source File: kaggle18.py From modin with Apache License 2.0 | 5 votes |
def tokenize(text):
    try:
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        text = regex.sub(" ", text)  # remove punctuation
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search("[a-zA-Z]", w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3]
        return filtered_tokens
    except TypeError as e:
        print(text, e)
Example #15
Source File: rake.py From rake-nltk with MIT License | 5 votes |
def extract_keywords_from_text(self, text):
    """Method to extract keywords from the text provided.

    :param text: Text to extract keywords from, provided as a string.
    """
    sentences = nltk.tokenize.sent_tokenize(text)
    self.extract_keywords_from_sentences(sentences)
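For context, this method is normally reached through rake-nltk's public API, roughly as follows (a sketch based on the library's documented usage; it needs the NLTK 'stopwords' and 'punkt' resources):

from rake_nltk import Rake

r = Rake()  # defaults to NLTK's English stopwords and standard punctuation
r.extract_keywords_from_text("Keyword extraction splits text into candidate phrases.")
print(r.get_ranked_phrases())  # ranked keyword phrases, best first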
Example #16
Source File: ner.py From metadoc with MIT License | 5 votes |
def __init__(self, text):
    self.perceptron_tagger = AveragedPerceptronTagger(autoload=True)
    self.stopwords = set(nltk.corpus.stopwords.words())
    self.top_fraction = 70  # consider top candidate keywords only

    self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    self.sentences = self.sent_detector.tokenize(text)
Example #17
Source File: text.py From open-solution-mapping-challenge with MIT License | 5 votes |
def _use_stopwords(self, x):
    words = tokenizer.tokenize(x)
    words = [w for w in words if not w in eng_stopwords]
    x = " ".join(words)
    return x
Example #18
Source File: text.py From open-solution-mapping-challenge with MIT License | 5 votes |
def _apostrophes(self, x):
    words = tokenizer.tokenize(x)
    words = [APPO[word] if word in APPO else word for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if not w in eng_stopwords]
    x = " ".join(words)
    return x
Example #19
Source File: word.py From flambe with MIT License | 5 votes |
def tokenize(self, example: str) -> List[str]:
    """Tokenize an input example.

    Parameters
    ----------
    example : str
        The input example, as a string

    Returns
    -------
    List[str]
        The output word tokens, as a list of strings

    """
    return example.split()
Example #20
Source File: data.py From dong_iccv_2017 with MIT License | 5 votes |
def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower())
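RegexpTokenizer(r'\w+') keeps only alphanumeric runs, so punctuation (including apostrophes) disappears entirely. A quick illustration, not part of the original file:

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
print(tokenizer.tokenize("Don't panic, it's fine!".lower()))
# ['don', 't', 'panic', 'it', 's', 'fine']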
Example #21
Source File: rte_classify.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'have',
                          'is', 'are', 'were', 'and', 'very', '.', ','])

    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        self.text_words = set([lemmatize(token) for token in self.text_tokens])
        self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
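One caveat about the tokenizer pattern above: RegexpTokenizer matches tokens with re.findall, and findall returns group contents rather than whole matches when the pattern contains a capturing group; depending on the NLTK version, the tokenizer may or may not rewrite such groups as non-capturing. A safer variant of the same idea, written as a sketch with an explicit non-capturing group:

from nltk.tokenize import RegexpTokenizer

# non-capturing group keeps findall returning whole matches
tokenizer = RegexpTokenizer(r'(?:[A-Z]\.)+|\w+|\$[\d\.]+')
print(tokenizer.tokenize("The U.S. imported $23.00 worth of goods."))
# 'U.S.' and '$23.00' should survive as single tokens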
Example #22
Source File: chunked.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def read_block(self, stream):
    block = []
    for para_str in self._para_block_reader(stream):
        para = []
        for sent_str in self._sent_tokenizer.tokenize(para_str):
            sent = self._str2chunktree(sent_str)

            # If requested, throw away the tags.
            if not self._tagged:
                sent = self._untag(sent)

            # If requested, throw away the chunks.
            if not self._chunked:
                sent = sent.leaves()

            # Add the sentence to `para`.
            if self._group_by_sent:
                para.append(sent)
            else:
                para.extend(sent)

        # Add the paragraph to `block`.
        if self._group_by_para:
            block.append(para)
        else:
            block.extend(para)

    # Return the block
    return block
Example #23
Source File: text.py From open-solution-data-science-bowl-2018 with MIT License | 5 votes |
def _apostrophes(self, x):
    words = tokenizer.tokenize(x)
    words = [APPO[word] if word in APPO else word for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if not w in eng_stopwords]
    x = " ".join(words)
    return x
Example #24
Source File: text.py From open-solution-data-science-bowl-2018 with MIT License | 5 votes |
def _use_stopwords(self, x):
    words = tokenizer.tokenize(x)
    words = [w for w in words if not w in eng_stopwords]
    x = " ".join(words)
    return x
Example #25
Source File: text.py From pliers with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _filter(self, stim):
    if self.tokenizer:
        tokens = self.tokenizer.tokenize(stim.text)
    else:
        tokens = word_tokenize(stim.text)
    stims = [TextStim(stim.filename, token, order=i)
             for i, token in enumerate(tokens)]
    return stims
Example #26
Source File: text.py From pliers with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _filter(self, stim):
    pos_map = {
        'ADJ': 'a',
        'ADJ_SAT': 's',
        'ADV': 'r',
        'NOUN': 'n',
        'VERB': 'v'
    }

    def pos_wordnet(txt):
        pos_tagged = dict(nltk.pos_tag(txt, tagset='universal'))
        pos_tagged = {t: pos_map[tag] if tag in pos_map else 'n'
                      for t, tag in pos_tagged.items()}
        return pos_tagged

    tokens = [stim.text]
    if self.tokenize:
        tokens = nltk.word_tokenize(tokens[0])
    tokens = [t if self.case_sensitive else t.lower() for t in tokens]
    if not isinstance(self.stemmer, stem.WordNetLemmatizer):
        stemmed = ' '.join([self.stemmer.stem(t) for t in tokens])
    else:
        pos_tagged = pos_wordnet(tokens)
        stemmed = ' '.join([self.stemmer.lemmatize(t, pos=pos_tagged[t])
                            for t in tokens])
    return TextStim(stim.filename, stemmed, stim.onset, stim.duration,
                    stim.order, stim.url)
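The WordNet branch above combines nltk.pos_tag with the universal tagset and WordNetLemmatizer. Outside of pliers, the same idea looks roughly like this (a sketch; it assumes the 'punkt', 'averaged_perceptron_tagger', 'universal_tagset', and 'wordnet' NLTK resources are downloaded):

from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
pos_map = {'ADJ': 'a', 'ADV': 'r', 'NOUN': 'n', 'VERB': 'v'}  # universal tag -> WordNet POS

tokens = word_tokenize("The striped bats were hanging on their feet")
tagged = pos_tag(tokens, tagset='universal')
lemmas = [lemmatizer.lemmatize(tok, pos=pos_map.get(tag, 'n')) for tok, tag in tagged]
print(lemmas)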
Example #27
Source File: text.py From pliers with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, stemmer='porter', tokenize=True, case_sensitive=False,
             *args, **kwargs):
    if isinstance(stemmer, str):
        if stemmer not in self._stemmers:
            valid = list(self._stemmers.keys())
            raise ValueError("Invalid stemmer '%s'; please use one of %s."
                             % (stemmer, valid))
        stemmer = getattr(stem, self._stemmers[stemmer])(*args, **kwargs)
    elif not isinstance(stemmer, (stem.StemmerI, stem.WordNetLemmatizer)):
        raise ValueError("stemmer must be either a valid string, or an "
                         "instance of class StemmerI.")
    self.stemmer = stemmer
    self.tokenize = tokenize
    self.case_sensitive = case_sensitive
    super().__init__()
Example #28
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()
    started = time.time()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        counts['paras'] += 1

        for sent in self._sent_tokenizer.tokenize(para):
            counts['sents'] += 1

            for word in self._word_tokenizer.tokenize(sent):
                counts['words'] += 1
                tokens[word] += 1

    # Compute the number of files and categories in the corpus
    n_fileids = len(self.resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self.resolve(fileids, categories)))

    # Return data structure with information
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }
Example #29
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileids=None, categories=None):
    """
    Segments, tokenizes, and tags a document in the corpus.
    """
    for paragraph in self.corpus.paras(fileids=fileids):
        yield [
            pos_tag(nltk.wordpunct_tokenize(sent))
            for sent in nltk.sent_tokenize(paragraph)
        ]
Example #30
Source File: s2v_encoder.py From S2V with Apache License 2.0 | 5 votes |
def _tokenize(self, item):
    """Tokenizes an input string into a list of words."""
    tokenized = []
    for s in self._sentence_detector.tokenize(item):
        tokenized.extend(nltk.tokenize.word_tokenize(s))

    return tokenized