Python nltk.tokenize.sent_tokenize() Examples
The following are 30 code examples of nltk.tokenize.sent_tokenize(), collected from open-source projects. Each example notes the project, source file, and license it was taken from. You may also want to check out the other functions and classes available in the nltk.tokenize module.
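Before working through the examples, here is a minimal, self-contained sketch of calling sent_tokenize() directly. It is an illustrative snippet, not taken from any of the projects below, and it assumes the Punkt sentence tokenizer models have been downloaded with nltk.download('punkt'):

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')  # one-time download of the Punkt sentence tokenizer models

text = "Dr. Smith went to Washington. He arrived on Jan. 5th and stayed two weeks. Was the trip useful?"

# Split the text into sentences, then break each sentence into word tokens,
# mirroring the sent_tokenize()/word_tokenize() pattern used in the examples below.
for sentence in sent_tokenize(text):
    print(sentence)
    print(word_tokenize(sentence))

sent_tokenize() also accepts an optional language argument (e.g. sent_tokenize(text, language='english')), which some of the examples below make use of.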
Example #1
Source File: DataReader.py From MachineLearningSamples-BiomedicalEntityExtraction with MIT License

def get_feature_vectors_1(self, data_list):
    print("Reading unlabeled data from dataframe")

    # list of list of tokens
    all_sentences_words = []

    # Process all lines in the file
    for line in data_list:
        text = line.strip()

        # break the input text into sentences before tokenization
        sentences = sent_tokenize(text)
        for sent in sentences:
            sentence_words = nltk.word_tokenize(sent)
            all_sentences_words.append(tuple(sentence_words))

    self.n_sentences_all = len(all_sentences_words)
    print("number of unlabeled examples = {}".format(self.n_sentences_all))

    return self.create_feature_vectors(all_sentences_words)

##################################################
# create_feature_vectors
##################################################
Example #2
Source File: do_sentence_segmentation.py From training with Apache License 2.0

def process_one_file(one_input):
    """Separate paragraphs into sentences, for one file."""
    input_filename = one_input + args.input_suffix
    output_filename = one_input + args.output_suffix
    logging.info('Processing %s => %s', input_filename, output_filename)

    with io.open(input_filename, 'r', encoding='utf-8') as fin:
        with io.open(output_filename, 'w', encoding='utf-8') as fout:
            for line in fin:
                if len(line) == 1:
                    fout.write(u'\n')
                sents = sent_tokenize(line)
                for sent in sents:
                    sent_str = sent.strip()
                    # if sent_str:
                    fout.write('%s\n' % sent_str)
            fout.write(u'\n')
Example #3
Source File: doc2vec.py From broca with MIT License

def _doc2vec_doc_stream(paths, n, tokenizer=word_tokenize, sentences=True):
    """
    Generator to feed sentences to the doc2vec model.
    """
    i = 0
    p = Progress()
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                i += 1
                p.print_progress(i/n)

                # We do minimal pre-processing here so the model can learn
                # punctuation
                line = line.lower()

                if sentences:
                    for sent in sent_tokenize(line):
                        tokens = tokenizer(sent)
                        yield LabeledSentence(tokens, ['SENT_{}'.format(i)])
                else:
                    tokens = tokenizer(line)
                    yield LabeledSentence(tokens, ['SENT_{}'.format(i)])
Example #4
Source File: batcher.py From docker with MIT License

def fill_example_queue(self, data_path, mode="test"):
    new_queue = []

    filelist = glob.glob(data_path)  # get the list of datafiles
    assert filelist, ('Error: Empty filelist at %s' % data_path)  # check filelist isn't empty
    filelist = sorted(filelist)

    if mode == "train":
        filelist = filelist

    for f in filelist:
        reader = codecs.open(f, 'r', 'utf-8')
        while True:
            string_ = reader.readline()
            if not string_:
                break
            dict_example = json.loads(string_)
            review = dict_example["review"]
            if len(sent_tokenize(review)) < 2:
                continue
            example = Example(review, self._vocab, self._hps)
            new_queue.append(example)
    return new_queue
Example #5
Source File: prepare_clc_fce_data.py From NLP_Toolkit with Apache License 2.0

def main():
    fce = convert_fce(args.fce_dataset_path)
    with open(args.output + "/fce-original.txt", 'w', encoding='utf-8') as out_original, \
            open(args.output + "/fce-applied.txt", 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    if re.search(r"[{>][.?!]$", tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    regexp = r'{([^{}]*?)=>([^{}]*?)}'
                    original = re.sub(regexp, r"\1", tokenized_sents[i])
                    applied = re.sub(regexp, r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n")
Example #6
Source File: corpus_cleaner.py From acl2017-interactive_summarizer with Apache License 2.0

def parse_xml_all(self, data_file, doc_type, language='english'):
    e = ET.parse(data_file)
    cluster_data = {}
    root = e.getroot()
    for topics in root:
        data = []
        topic_id = topics.attrib.get('id')
        for documents in topics.findall(doc_type):
            doc_id = documents.attrib.get('id')
            if doc_type == 'document':
                title_text = documents.find('title').text
            doc_text = documents.find('text').text
            text = text_normalization(doc_text)
            doc_sents = sent_tokenize(text, language)
            data.append([doc_id, doc_sents])
        cluster_data[topic_id] = data
    return cluster_data
Example #7
Source File: summarizer.py From delbot with GNU Affero General Public License v3.0

def summarize(self, text, n):
    """
    Return a list of n sentences which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
Example #8
Source File: kaggle18.py From modin with Apache License 2.0

def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try:
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        text = regex.sub(" ", text)  # remove punctuation
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search("[a-zA-Z]", w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3]
        return filtered_tokens
    except TypeError as e:
        print(text, e)
Example #9
Source File: word2vec.py From Bidirectiona-LSTM-for-text-summarization- with MIT License

def createCorpus(t):
    # st and wt are presumably module-level aliases for sent_tokenize and word_tokenize
    corpus = []
    all_sent = []
    for k in t:
        for p in t[k]:
            corpus.append(st(p))
    for sent in range(len(corpus)):
        for k in corpus[sent]:
            all_sent.append(k)
    for m in range(len(all_sent)):
        all_sent[m] = wt(all_sent[m])

    all_words = []
    for sent in all_sent:
        hold = []
        for word in sent:
            hold.append(word.lower())
        all_words.append(hold)
    return all_words
Example #10
Source File: Word_Frequency_Summarization.py From nlp-akash with MIT License

def run_summarization(text):
    # 1 Create the word frequency table
    freq_table = _create_frequency_table(text)

    '''
    We already have a sentence tokenizer, so we just need
    to run the sent_tokenize() method to create the array of sentences.
    '''

    # 2 Tokenize the sentences
    sentences = sent_tokenize(text)

    # 3 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(sentences, freq_table)

    # 4 Find the threshold
    threshold = _find_average_score(sentence_scores)

    # 5 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)

    return summary
Example #11
Source File: DataReader.py From MachineLearningSamples-BiomedicalEntityExtraction with MIT License

def get_feature_vectors_2(self, data_file):
    print("Loading unlabeled data from file {}".format(data_file))
    with open(data_file, 'r') as f_data:
        all_sentences_words = []

        # Process all lines in the file
        for line in f_data:
            text = line.strip()

            # break the input text into sentences before tokenization
            sentences = sent_tokenize(text)
            for sent in sentences:
                sentence_words = nltk.word_tokenize(sent)
                all_sentences_words.append(tuple(sentence_words))

    self.n_sentences_all = len(all_sentences_words)
    print("number of unlabeled examples = {}".format(self.n_sentences_all))

    return self.create_feature_vectors(all_sentences_words)

##################################################
# get_feature_vectors_1
##################################################
Example #12
Source File: labled_tsv_to_tfrecords_single_sentences.py From bran with Apache License 2.0

def convert_to_single_sentence(doc_str, e1_start, e1_end, e2_start, e2_end, annotation_map):
    offsets = zip(e1_start + e2_start, e1_end + e2_end,
                  [1] * len(e1_start) + [2] * len(e2_start))
    offsets = sorted(offsets, key=lambda tup: tup[0])
    replaced_doc_str = [process_single_annotation(doc_str, 0, s, e, annotation_map, i, ent_id)
                        if i == 0
                        else process_single_annotation(doc_str, offsets[i - 1][1], s, e, annotation_map, i, ent_id)
                        for i, (s, e, ent_id) in enumerate(offsets)]
    replaced_doc_str.append(' '.join(doc_str[offsets[-1][1]:]))
    new_doc_str = ''.join(replaced_doc_str)

    ## TODO only works for data with single e1 and e2 mention
    sentences = sent_tokenize(new_doc_str.replace('@@ ', '').decode('utf-8'))
    tokenized_sents = [tokenize(s) for s in sentences]
    chosen_sent = [i for i, s in enumerate(sentences) if s.count(ENTITY_STRING) >= 2]

    if chosen_sent:
        if FLAGS.full_abstract:
            replaced_sent = [annotation_map[w] if w in annotation_map else w
                             for s in tokenized_sents for w in s]
        else:
            idx = chosen_sent[0]
            s_idx = max(0, idx - FLAGS.sentence_window)
            e_idx = min(idx + FLAGS.sentence_window + 1, len(tokenized_sents))
            window_sentences = [tokenized_sents[i] for i in range(s_idx, e_idx)]
            replaced_sent = [annotation_map[w] if w in annotation_map else w
                             for s in window_sentences for w in s]
        return replaced_sent
Example #13
Source File: extras.py From semeval2017-scienceie with Apache License 2.0

def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        scaped_tok = re.escape(tok)
        m = re.search(scaped_tok, tail)
        start, end = m.span()

        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end

        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
Example #14
Source File: NewsArticleClass.py From Python-Scripts-Repo-on-Data-Science with GNU General Public License v3.0

def summarize(self, article, n):
    # article is a (text, title) tuple
    text = article[0]
    title = article[1]
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sentence in enumerate(word_sent):
        for word in sentence:
            if word in self._freq:
                ranking[i] += self._freq[word]
    sentences_index = nlargest(n, ranking, key=ranking.get)
    return [sentences[j] for j in sentences_index]

##############################################################################
# TEST
Example #15
Source File: util.py From camr with GNU General Public License v2.0

def find_abr_fullname(doc, query, Num):
    """Find the query's (abbreviation's) full name within the document.

    Parameters:
    doc: the document to be searched (specified format)
    query: the abbreviation
    Num: the number of sentences before the query in which to look for the full name
         (here we assume that the full name of the query always appears before the query)
    """
    sents = [word_tokenize(t) for t in sent_tokenize(doc)]
    for i, sent in enumerate(sents):
        if query in sent:
            fullname = find_abr_fn(sent, query)
            if fullname != -1:
                return fullname
            else:
                j = 1
                while i - j >= 0 and j <= Num:
                    if find_abr_fn(sents[i - j], query) == -1:
                        j += 1
                    else:
                        return find_abr_fn(sents[i - j], query)
    raise Exception('No query in the document.')
Example #16
Source File: NewsArticleClass.py From Python-Scripts-Repo-on-Data-Science with GNU General Public License v3.0

def extractFeatures(self, article, n, customStopWords=None):
    # pass in article as a tuple (text, title)
    text = article[0]   # extract the text
    title = article[1]  # extract the title
    sentences = sent_tokenize(text)  # split text into sentences
    word_sent = [word_tokenize(a.lower()) for a in sentences]  # split sentences into words
    self._freq = self._compute_frequencies(word_sent, customStopWords)  # calculate word freq using member func created above
    if n < 0:
        # how many features (words) to return - a -ve number means
        # no feature (word) selection, just return all features
        return nlargest(len(self._freq.keys()), self._freq, key=self._freq.get)
    else:
        # here we say if the calling func has asked for a subset
        # then return only the 'n' largest features, i.e. the
        # most important words (important == frequent, fewer stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
Example #17
Source File: background.py From language with Apache License 2.0

def score_sentences(query, doc_json, entity, sentence_scores, max_sentence_len, n=3):
    """Score sentences with respect to the query."""
    sentences = tokenize.sent_tokenize(doc_json['text'])
    query_ngrams = util.get_ngrams(tokenize.word_tokenize(query), n)
    for sentence in sentences:
        sentence_tokens = tokenize.word_tokenize(sentence)
        tokens = tokenize.word_tokenize(
            entity['wikipedia_name']) + [':'] + sentence_tokens[:max_sentence_len]
        sentence_ngrams = util.get_ngrams(tokens, n)
        score = len(set(sentence_ngrams).intersection(query_ngrams)) / max(
            1, len(query_ngrams))
        sentence_scores.append(((entity, sentence_tokens), score))
Example #18
Source File: create_pretraining_data.py From language with Apache License 2.0

def split_into_sentences(text, doc_annotations, tokenizer):
    """Split into sentences and return bookkeeping info."""
    sentences = []
    sentences_starts = []
    sentence_annotations = []
    doc_annotations = sorted(doc_annotations, key=lambda x: x[2])
    annotation_idx = 0
    sentences_text = tokenize.sent_tokenize(text)
    token_idx = 0
    for sentence_text in sentences_text:
        sub_tokens, word_starts = tokenizer.tokenize(sentence_text)
        sentences.append(sub_tokens)
        sentences_starts.append(word_starts)
        sentence_annotations.append([])
        token_idx += len(sentence_text.split(" "))
        while annotation_idx < len(
                doc_annotations) and doc_annotations[annotation_idx][2] < token_idx:
            sentence_annotations[-1].append(doc_annotations[annotation_idx])
            annotation_idx += 1
    return sentences, sentences_starts, sentence_annotations
Example #19
Source File: preprocess.py From serapis with MIT License

def paragraph_to_sentences(paragraph, term):
    """Turns a paragraph into clean, preprocessed sentences"""
    result = []
    paragraph = re.sub(r"([^ ])([\(\[\"])", r"\1 \2", paragraph)  # Give brackets space to breathe
    paragraph = re.sub(r"([\)\]\"\!\?:])([^ ])", r"\1 \2", paragraph)
    paragraph = re.sub(r"([^. ]{3})\.([^. ]{3}|A |An )", r"\1. \2", paragraph)
    paragraph = re.sub(r" e\.?g\.? ", " _eg_ ", paragraph)  # sent_tokenize improperly splits sentences here
    paragraph = re.sub(r" i\.?e\.? ", " _ie_ ", paragraph)
    sentences = sent_tokenize(paragraph)
    for sentence in sentences:
        sentence = sentence.replace("_eg_", "_e.g._").replace("_ie_", "i.e.")  # reverts edge case
        processed = preprocess_sentence(sentence, term)
        if qualify_sentence(processed):
            result.append(processed)
    return result

# Sentences ########################
Example #20
Source File: text_summarizer.py From nlp_url_summarizer with MIT License

def summarize(self, text, n):
    """
    Return a list of n sentences which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
Example #21
Source File: textsum_data_convert.py From TextSum with MIT License

def _convert_files_to_binary(input_filenames, output_filename):
    with open(output_filename, 'wb') as writer:
        for filename in input_filenames:
            with open(filename, 'r') as f:
                document = f.read()

                document_parts = document.split('\n', 1)
                assert len(document_parts) == 2

                title = '<d><p><s>' + document_parts[0] + '</s></p></d>'

                body = document_parts[1].decode('utf8').replace('\n', ' ').replace('\t', ' ')
                sentences = sent_tokenize(body)
                body = '<d><p>' + ' '.join(['<s>' + sentence + '</s>' for sentence in sentences]) + '</p></d>'
                body = body.encode('utf8')

                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend([body])
                tf_example.features.feature['abstract'].bytes_list.value.extend([title])
                tf_example_str = tf_example.SerializeToString()
                str_len = len(tf_example_str)
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, tf_example_str))
Example #22
Source File: vocabulary.py From topicModelling with GNU General Public License v3.0

def doc_to_ids(self, doc, training=True):
    l = []
    words = dict()
    window = 150
    # doc = doc.replace("–", " ")
    # doc = sent_tokenize(doc)
    for sentence in doc:
        miniArray = []
        for term in sentence:
            id = self.term_to_id(term, training)
            if id != None:
                miniArray.append(id)
                if not id in words:
                    words[id] = 1
                    self.docfreq[id] += 1
        if not len(miniArray):
            continue
        if len(miniArray) > window:
            l.extend([np.array(miniArray[i:i + window]) for i in xrange(0, len(miniArray), window)])
        else:
            l.append(np.array(miniArray))
    return l
Example #23
Source File: Article.py From find-all-the-new-words with MIT License

def out_put_important_sentences(self):
    pp_m_article = re.sub(r"\n", r".\n", self.marked_article)
    sentences = tokenize.sent_tokenize(pp_m_article)
    i_sentences = [_ if self.pattern.search(_) else None for _ in sentences]
    write_important_sentances_to_file("./others/", self.name,
                                      "\n\n".join(list(filter(None, i_sentences))))
Example #24
Source File: main.py From Python-DevOps with MIT License

def tokenizer(string):
    return [word_tokenize(t) for t in sent_tokenize(string)]
Example #25
Source File: vocabulary_sentenceLayer.py From topicModelling with GNU General Public License v3.0

def doc_to_ids(self, doc, training=True):
    l = []
    words = dict()
    doc_sents = sent_tokenize(doc)
    for sentence in doc_sents:
        miniArray = []
        for term in sentence.split():
            id = self.term_to_id(term, training)
            if id != None:
                miniArray.append(id)
                if not id in words:
                    words[id] = 1
                    # Counts in how many documents a word appears. If it appears in
                    # only a few, remove it from the vocabulary using cut_low_freq()
                    self.docfreq[id] += 1
        l.append(np.array(miniArray, dtype=np.int32))
    return l
Example #26
Source File: preprocess.py From BERT-pytorch with The Unlicense

def detect_sentences(raw_documents_path, sentences_detected_path, **_):
    with open(raw_documents_path) as raw_documents_file, \
            open(sentences_detected_path, 'w') as sentences_detected_file:
        for line in tqdm(raw_documents_file):
            sentences = sent_tokenize(line.strip())
            tokenized_sentences = []
            for sentence in sentences:
                sentence = sentence.lower()
                sentence = NUMBERS.sub('N', sentence)
                tokens = [match.group() for match in TOKENIZATION.finditer(sentence)]
                if not tokens:
                    continue
                tokenized_sentences.append(' '.join(tokens))
            output_line = '|'.join(tokenized_sentences) + '\n'
            sentences_detected_file.write(output_line)
Example #27
Source File: raw_books_preproc_pipeline.py From language with Apache License 2.0

def split_line_by_sentences(line):
    return sent_tokenize(line)
Example #28
Source File: books_preproc_pipeline.py From language with Apache License 2.0

def split_line_by_sentences(line):
    return sent_tokenize(line)
Example #29
Source File: summarize.py From Django-Bookworm with MIT License

def __init__(self, text):
    self.__text = text
    self.__stop_words = stopwords.words('english')
    self.__sentence = sent_tokenize(text)
    self.__f_text = self.create_formatted_text()
    self.__word_freq = self.calc_word_frequencies()
Example #30
Source File: phrases.py From broca with MIT License

def _phrase_doc_stream(paths, n, tokenizer=word_tokenize):
    """
    Generator to feed sentences to the phrase model.
    """
    i = 0
    p = Progress()
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                i += 1
                p.print_progress(i/n)
                for sent in sent_tokenize(line.lower()):
                    tokens = tokenizer(sent)
                    yield tokens