Python nltk.sent_tokenize() Examples
The following are 30 code examples of nltk.sent_tokenize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the link above each example. You may also want to check out all available functions and classes of the nltk module.
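Before the project examples, here is a minimal usage sketch of nltk.sent_tokenize() itself: it relies on NLTK's pre-trained Punkt model, which has to be downloaded once (the sample text below is made up for illustration).

import nltk

# One-time download of the pre-trained Punkt sentence tokenizer models.
nltk.download('punkt')

text = "NLTK makes sentence splitting easy. It ships a pre-trained Punkt model. Download it once and reuse it."

# Split the text into a list of sentence strings and print one per line.
for sentence in nltk.sent_tokenize(text):
    print(sentence)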
Example #1
Source File: kaggle.py From dl-models-for-qa with Apache License 2.0 | 8 votes |
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
Example #2
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 6 votes |
def flesch_kincaid_reading_ease(text, token_count):
    """
    Takes a text and returns its FK Reading Ease
    :param text: A string text
    :param token_count: the number of tokens in the text
    :return: FK Reading Ease
    """
    # Partly extracted from textstat 0.3.1 which is only available for python 2 (https://github.com/shivam5992/textstat)
    def avg_syllables_per_word(text, token_count):
        syllable = syllable_count(text)
        if token_count > 0:
            return float(syllable) / float(token_count)
        else:
            return 0

    if len(nltk.sent_tokenize(text)) <= 0 or token_count <= 0:
        return 0
    ASL = float(token_count / len(nltk.sent_tokenize(text)))  # avg sentence length
    ASW = avg_syllables_per_word(text, token_count)
    FKRA = 206.835 - float(1.015 * ASL) - float(84.6 * ASW)
    return FKRA
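A possible way to call the function above; note that it expects the caller to pass the token count separately, and that the syllable_count() helper it uses is defined elsewhere in the project. The word-level tokenization below is an assumption for illustration, not part of the original file.

import nltk

# Hypothetical call; assumes flesch_kincaid_reading_ease and its
# syllable_count helper are importable from the project.
text = "This is a short example. It has two simple sentences."
token_count = len(nltk.word_tokenize(text))  # one reasonable way to obtain the token count
print(flesch_kincaid_reading_ease(text, token_count))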
Example #3
Source File: extract_data.py From LSTM-CRF-models with MIT License | 6 votes |
def prepareSents(wrds):
    valid_sents = []
    text = ''.join(wrd[0] for wrd in wrds)
    sent_list = [[(word, 0, 'None') for word in sent] for sent in sent_tokenize(text)]
    text = [word for word in wrds if word[0] != ' ']
    sent_list = [[word for word in concat_words(strip_chars(sent)) if word[0] != ' '] for sent in sent_list]
    idx = 0
    s_idx = 0
    while idx < len(text) and s_idx < len(sent_list):
        if not match_words(sent_list[s_idx], text[idx:idx + len(sent_list[s_idx])]):
            print("NLTK:" + str(sent_list[s_idx]))
            print('MINE:' + str(text[idx:idx + len(sent_list[s_idx])]))
        else:
            valid_sents += [text[idx:idx + len(sent_list[s_idx])]]
        idx = idx + len(sent_list[s_idx])
        s_idx += 1
    return valid_sents
Example #4
Source File: nlp.py From partisan-discourse with Apache License 2.0 | 6 votes |
def preprocess(html):
    """
    Returns a preprocessed document consisting of a list of paragraphs, which
    is a list of sentences, which is a list of tuples, where each tuple is a
    (token, part of speech) pair.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e)))
Example #5
Source File: gender.py From atap with Apache License 2.0 | 6 votes |
def parse_gender(text):
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
Example #6
Source File: custom.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 6 votes |
def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []

    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        # Get POS
        phrase = []

        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []

    return phrase_list
Example #7
Source File: reddit_utils.py From neural_chat with MIT License | 6 votes |
def clean_thread_conversations(sub_str):
    conversations = []
    for mon in ['07', '08', '09', '10', '11', '12']:
        with open('datasets/raw_reddit/reddit_{}_{}_18threads.json'.format(sub_str, mon)) as f:
            data = json.load(f)
        for thread in data:
            new_convo = {}
            new_convo['lines'] = []
            speaker = 0
            for msg in thread:
                text = clean_post(msg['text'])
                if len(text) > 1:
                    sentences = nltk.sent_tokenize(text)
                    for sent in sentences:
                        sent_dict = {}
                        sent_dict['character'] = speaker
                        sent_dict['text'] = sent
                        new_convo['lines'].append(sent_dict)
                    speaker = 1 - speaker
            if len(new_convo['lines']) > 1:
                conversations.append(new_convo)
    return conversations
Example #8
Source File: psykey.py From tika-similarity with Apache License 2.0 | 6 votes |
def __init__(self, text, wordlistfolder):
    self.text = text
    self.tokens = nltk.word_tokenize(text)
    self.sentenses = nltk.sent_tokenize(text)
    self.tags = nltk.pos_tag(self.tokens)

    self.featspace = []

    self.psykfeatspace(self.featspace, wordlistfolder)
    self.bigrams(self.featspace)
    self.number_count(self.featspace)
    self.punc_count(self.featspace)
    self.big_word_count(self.featspace)
    self.words_per_sentence(self.featspace)
    self.sentence_count(self.featspace)
    self.countPOS(self.featspace, 'CC')
    self.countPOS(self.featspace, 'NP')
    self.countPOS(self.featspace, 'NNP')
    self.words(self.featspace)
    self.stem(self.featspace)

# Counts a specific POS tags
Example #9
Source File: text.py From pliers with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _from_text(self, text, unit, tokenizer, language):
    if tokenizer is not None:
        if isinstance(tokenizer, str):
            tokens = re.findall(tokenizer, text)
        else:
            tokens = tokenizer.tokenize(text)
    else:
        import nltk

        @requires_nltk_corpus
        def tokenize_text(text):
            if unit == 'word':
                return nltk.word_tokenize(text, language)
            elif unit.startswith('sent'):
                return nltk.sent_tokenize(text, language)
            else:
                raise ValueError(
                    "unit must be either 'word' or 'sentence'")

        tokens = tokenize_text(text)

    for i, t in enumerate(tokens):
        self._elements.append(TextStim(text=t, onset=None, duration=None, order=i))
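As the snippet above shows, nltk.sent_tokenize() takes an optional language argument that selects the Punkt model to use. A small sketch with a non-English language (the German text is made up for illustration):

import nltk

# Requires the Punkt models: nltk.download('punkt')
german_text = "Das ist der erste Satz. Hier kommt der zweite Satz."
print(nltk.sent_tokenize(german_text, language='german'))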
Example #10
Source File: sentence.py From Valx with GNU General Public License v3.0 | 6 votes |
def sentence_splitting(texts, slen=1):
    if len(texts) <= 0:
        return []

    # splitting
    sentences = []
    text_sents = sent_tokenize(texts)
    if (text_sents != [''] and len(text_sents) > 0):
        for sent in text_sents:
            sent = sent.strip().split('\r')  # split strings that contain "\r"
            for sen in sent:
                se = sen.split('. ')
                for s in se:
                    if (NLP_word.words_counting(s) >= slen):
                        sentences.append(s)
    return sentences

# splitting text into Sentences using NLTK tokenization
Example #11
Source File: nltk_plugin.py From self-attentive-parser with MIT License | 6 votes |
def parse_sents(self, sents):
    """
    Parse multiple sentences

    If "sents" is a string, it will be segmented into sentences using NLTK.
    Otherwise, each element of "sents" will be treated as a sentence.

    sents (str or Iterable[str] or Iterable[List[str]]): sentences to parse

    Returns: Iter[nltk.Tree]
    """
    if isinstance(sents, STRING_TYPES):
        if self._tokenizer_lang is None:
            raise ValueError(
                "No tokenizer available for this language. "
                "Please split into individual sentences and tokens "
                "before calling the parser."
            )
        sents = nltk.sent_tokenize(sents, self._tokenizer_lang)

    for parse_raw, tags_raw, sentence in self._batched_parsed_raw(self._nltk_process_sents(sents)):
        yield self._make_nltk_tree(sentence, tags_raw, *parse_raw)
Example #12
Source File: preprocessor.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileid):
    """
    Segments, tokenizes, and tags a document in the corpus. Returns a
    generator of paragraphs, which are lists of sentences, which in turn are
    lists of part of speech tagged words.
    """
    for paragraph in self.corpus.paras(fileids=fileid):
        yield [
            nltk.pos_tag(nltk.wordpunct_tokenize(sent))
            for sent in nltk.sent_tokenize(paragraph)
        ]
Example #13
Source File: harvesttext.py From HarvestText with MIT License | 5 votes |
def cut_sentences(self, para, drop_empty_line=True, strip=True, deduplicate=False):
    '''cut_sentences

    :param para: input text
    :param drop_empty_line: whether to drop empty lines
    :param strip: whether to strip() each sentence
    :param deduplicate: whether to collapse runs of repeated punctuation, which helps split sentences that end with consecutive punctuation marks
    :return: sentences: list of str
    '''
    if deduplicate:
        para = re.sub(r"([。!?\!\?])\1+", r"\1", para)

    if self.language == 'en':
        from nltk import sent_tokenize
        sents = sent_tokenize(para)
        if strip:
            sents = [x.strip() for x in sents]
        if drop_empty_line:
            sents = [x for x in sents if len(x.strip()) > 0]
        return sents
    else:
        para = re.sub('([。!?\?!])([^”’])', r"\1\n\2", para)  # single-character sentence terminators
        para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
        para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
        para = re.sub('([。!?\?!][”’])([^,。!?\?])', r'\1\n\2', para)  # if a closing quote follows a terminator, the quote is the end of the sentence, so the split marker \n goes after the quote; note that the rules above carefully preserve quote characters
        para = para.rstrip()  # drop any extra \n at the end of the paragraph
        # Many rule sets also handle the semicolon ;, but it is ignored here, as are dashes and English double quotes; simple adjustments can be made if needed.
        sentences = para.split("\n")
        if strip:
            sentences = [sent.strip() for sent in sentences]
        if drop_empty_line:
            sentences = [sent for sent in sentences if len(sent.strip()) > 0]
        return sentences
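For reference, the Chinese branch above can be exercised on its own. Below is a minimal standalone adaptation, reduced to the single-character terminator rule; it is a sketch for illustration, not part of HarvestText.

import re

def cut_chinese_sentences(para):
    # Insert a newline after each sentence-ending punctuation mark that is not
    # followed by a closing quote, then split on the newlines
    # (single-character terminator rule from the method above).
    para = re.sub('([。!?\?!])([^”’])', r"\1\n\2", para)
    return [sent.strip() for sent in para.rstrip().split("\n") if sent.strip()]

print(cut_chinese_sentences("今天天气很好!我们去公园吧。你觉得怎么样?"))
# ['今天天气很好!', '我们去公园吧。', '你觉得怎么样?']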
Example #14
Source File: find_full_text_sentence.py From indra with BSD 2-Clause "Simplified" License | 5 votes |
def sentence_tokenize(self, text):
    # return text.split('.')
    return sent_tokenize(text)
Example #15
Source File: tokenize.py From pywsd with MIT License | 5 votes |
def word_tokenize(text, language='english', preserve_line=False):
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [token for sent in sentences
            for token in _treebank_word_tokenizer.tokenize(sent)]
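The _treebank_word_tokenizer referenced above is a module-level tokenizer instance defined elsewhere in pywsd. A self-contained sketch of the same sentence-first, then word-level tokenization pattern, using NLTK's own TreebankWordTokenizer:

from nltk import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer

_treebank_word_tokenizer = TreebankWordTokenizer()

text = "Sentence one is here. Sentence two follows it."
# Split into sentences first, then tokenize each sentence into words.
tokens = [token
          for sent in sent_tokenize(text, 'english')
          for token in _treebank_word_tokenizer.tokenize(sent)]
print(tokens)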
Example #16
Source File: zoo.py From razdel with MIT License | 5 votes |
def nltk_sentenize(text):
    from nltk import sent_tokenize

    chunks = sent_tokenize(text, 'russian')
    return find_substrings(chunks, text)
Example #17
Source File: analyzer.py From homer with MIT License | 5 votes |
def __init__(self, paragraph):
    paragraph = paragraph.replace('—', ' ')
    self.paragraph = paragraph
    self.tokenized_sentences = nltk.sent_tokenize(paragraph)
    self._sentences = [Sentence(sentence) for sentence in self.tokenized_sentences]
Example #18
Source File: SensationalismClassifier.py From news-audit with GNU General Public License v3.0 | 5 votes |
def transform(self, text_fields):
    stats = []
    punctuation = string.punctuation
    abvs = ['CNN', 'FBI', 'ABC', 'MSNBC', 'GOP', 'U.S.', 'US', 'ISIS', 'DNC', 'TV', 'CIA', 'I', 'AP', 'PM', 'AM',
            'EU', 'USA', 'UK', 'UN', 'CEO', 'NASA', 'LGBT', 'LGBTQ', 'NAFTA', 'ACLU']
    for field in text_fields:
        field_stats = {}
        tok_text = nltk.word_tokenize(field)
        try:
            num_upper = float(len([w for w in tok_text if w.isupper() and w not in abvs])) / len(tok_text)
        except:
            num_upper = 0
        try:
            num_punct = float(len([ch for ch in field if ch in punctuation])) / len(field)
        except:
            num_punct = 0
        try:
            sent_lengths = [len(nltk.word_tokenize(s)) for s in nltk.sent_tokenize(field)]
            av_sent_len = float(sum(sent_lengths)) / len(sent_lengths)
        except:
            av_sent_len = 0
        try:
            num_prof = float(len([w for w in tok_text if w.lower() in PROFANITY])) / len(tok_text)
        except:
            num_prof = 0

        polarity, subjectivity = sentiment(field)

        field_stats['all_caps'] = num_upper
        field_stats['sent_len'] = av_sent_len
        field_stats['polarity'] = polarity
        field_stats['subjectivity'] = subjectivity
        field_stats['profanity'] = num_prof
        stats.append(field_stats)

    return stats
Example #19
Source File: sent_parsing.py From atap with Apache License 2.0 | 5 votes |
def sents(paragraph):
    for sentence in sent_tokenize(paragraph):
        yield sentence
Example #20
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def coleman_liau_index(text, token_count):
    """
    Takes a text and returns its Coleman Liau Index
    :param text: A string text
    :return: Coleman Liau Index
    """
    # Partly extracted from textstat 0.3.1 which is only available for python 2 (https://github.com/shivam5992/textstat)
    def char_count(text):
        """
        Function to return total character counts in a text
        """
        count_chars = 0
        text = text.replace(" ", "")
        for char in text:
            if char not in string.punctuation:
                count_chars += 1
        return count_chars

    def avg_letters_per_word(text):
        ALPW = float(float(char_count(text)) / token_count)
        return ALPW

    def avg_sentence_per_word(text):
        ASPW = float(len(nltk.sent_tokenize(text)) / float(token_count))
        return ASPW

    if token_count <= 0:
        return 0
    L = avg_letters_per_word(text) * 100   # avg letters per 100 words
    S = avg_sentence_per_word(text) * 100  # avg sentences per 100 words
    CLI = float((0.0588 * L) - (0.296 * S) - 15.8)
    return CLI
Example #21
Source File: preprocess.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileid):
    """
    Segments, tokenizes, and tags a document in the corpus. Returns a
    generator of paragraphs, which are lists of sentences, which in turn are
    lists of part of speech tagged words.
    """
    for paragraph in self.corpus.paras(fileids=fileid):
        yield [
            nltk.pos_tag(nltk.wordpunct_tokenize(sent))
            for sent in nltk.sent_tokenize(paragraph)
        ]
Example #22
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def sents(self, fileids=None, categories=None):
    """
    Uses the built in sentence tokenizer to extract sentences from the
    paragraphs. Note that this method uses BeautifulSoup to parse HTML.
    """
    for paragraph in self.paras(fileids, categories):
        for sentence in sent_tokenize(paragraph):
            yield sentence
Example #23
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def automated_readability_index(text, token_count):
    """
    Takes a text and returns its Automated Readability Index
    :param text: A string text without punctuation
    :return: Automated Readability Index
    """
    # Partly extracted from textstat 0.3.1 which is only available for python 2 (https://github.com/shivam5992/textstat)
    def char_count(text):
        """
        Function to return total character counts in a text
        """
        count_chars = 0
        text = text.replace(" ", "")
        for char in text:
            if char not in string.punctuation:
                count_chars += 1
        return count_chars

    chrs = char_count(text)
    wrds = token_count
    snts = len(nltk.sent_tokenize(text))
    if wrds == 0 or snts == 0:
        return 0
    a = (float(chrs) / float(wrds))
    b = (float(wrds) / float(snts))
    ARI = (4.71 * a) + (0.5 * b) - 21.43
    return ARI
Example #24
Source File: oz.py From atap with Apache License 2.0 | 5 votes |
def matrix(text, cast):
    mtx = []
    for first in cast:
        row = []
        for second in cast:
            count = 0
            for title, chapter in text['chapters'].items():
                for sent in sent_tokenize(chapter):
                    if first in sent and second in sent:
                        count += 1
            row.append(count)
        mtx.append(row)
    return mtx
Example #25
Source File: oz.py From atap with Apache License 2.0 | 5 votes |
def cooccurrence(text, cast):
    possible_pairs = list(itertools.combinations(cast, 2))
    cooccurring = dict.fromkeys(possible_pairs, 0)
    for title, chapter in text['chapters'].items():
        for sent in sent_tokenize(chapter):
            for pair in possible_pairs:
                if pair[0] in sent and pair[1] in sent:
                    cooccurring[pair] += 1
    return cooccurring
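Both oz.py functions expect text to be a dict with a 'chapters' mapping from chapter titles to chapter text, and cast to be a list of character names. A hypothetical call to cooccurrence() (the chapter text and names below are made up, and the function above is assumed to be in scope together with its itertools and sent_tokenize imports):

text = {
    'chapters': {
        'Chapter 1': "Dorothy lived in Kansas. Dorothy met the Scarecrow on the road.",
        'Chapter 2': "The Scarecrow and the Lion followed Dorothy.",
    }
}
cast = ['Dorothy', 'Scarecrow', 'Lion']

print(cooccurrence(text, cast))
# {('Dorothy', 'Scarecrow'): 2, ('Dorothy', 'Lion'): 1, ('Scarecrow', 'Lion'): 1}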
Example #26
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def lix_index(text, token_count):
    """
    A readability measure developed by Carl-Hugo Björnsson
    Formula adapted from: https://en.wikipedia.org/wiki/LIX
    :param text: A string text without punctuation
    :return: LIX Index
    """
    def get_long_word_count(text):
        """ Returns the number of words with more than 6 letters """
        long_word_count = 0
        for word in nltk.word_tokenize(text):
            if len(word) > 6:
                long_word_count += 1
        return long_word_count

    A = token_count  # number of words
    B = 0            # number of sentences (also split at ':')
    for sent in nltk.sent_tokenize(text):
        B += len(re.split(':', sent))
    C = get_long_word_count(text)  # number of words with more than 6 letters

    if B > 0 and A > 0:
        LIX = float(A / B) + float((C * 100) / A)
        return LIX
    else:
        return 0
Example #27
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def rix_index(text):
    """
    A readability measure developed by Anderson, simplifies the LIX index
    Anderson, Jonathan. "Analysing the Readability of English and Non-English Texts in the Classroom with Lix"
    source: http://www.jstor.org/stable/40031755?seq=1#page_scan_tab_contents
    :param text: A string text without punctuation
    :return: RIX Index
    """
    def get_long_word_count(text):
        """ Returns the number of words with more than 6 letters """
        long_word_count = 0
        for word in nltk.word_tokenize(text):
            if len(word) > 6:
                long_word_count += 1
        return long_word_count

    sent_count = 0  # number of sentences (also split at ':' and ';')
    for sent in nltk.sent_tokenize(text):
        sent_count += len(re.split('[:;]', sent))
    long_word_count = get_long_word_count(text)  # number of words with more than 6 letters

    if sent_count > 0:
        return float(long_word_count / sent_count)
    else:
        return 0
Example #28
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def mcalpine_eflaw_index(text):
    """
    A readability score defined by Rachel McAlpine
    See https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/
    EFLAW index = (#tokens + #miniwords) / #sentences
    :param text: A string text without punctuation
    :return: McAlpine EFLAW Index
    """
    tokenized_sents = nltk.sent_tokenize(text)
    sentence_count = len(tokenized_sents)
    token_count = 0
    miniword_count = 0  # words with 1, 2 or 3 letters
    for sent in tokenized_sents:
        for token in nltk.word_tokenize(sent):
            if token not in string.punctuation:
                token_count += 1
                if len(token) <= 3:
                    miniword_count += 1
    if sentence_count >= 1:
        return float((token_count + miniword_count) / sentence_count)
    else:
        return 0
Example #29
Source File: ngrams.py From atap with Apache License 2.0 | 5 votes |
def ngrams2(text, n=2):
    for sent in sent_tokenize(text):
        sent = word_tokenize(sent)
        for ngram in nltk_ngrams(sent, n):
            yield ngram
Example #30
Source File: readability.py From serapis with MIT License | 5 votes |
def __init__(self, doc):
    """
    Args:
        doc: str
    """
    self.doc = unidecode(doc)
    self.sentence_count = len(sent_tokenize(doc))
    words = word_tokenize(doc)
    syllables = [self._count_syllables(word) for word in words]
    self.char_count = sum(len(word) for word in words)
    self.syllable_count = sum(syllables)
    self._invalid = not self.sentence_count or not self.char_count
    self.complex_word_count = len([s for s in syllables if s >= 4])
    self.word_count = len(words)
    self.words_per_sentence = 1.0 * self.word_count / self.sentence_count if not self._invalid else 0