Python nltk.tokenize.sent_tokenize() Examples
The following are 30 code examples of nltk.tokenize.sent_tokenize(), collected from open-source projects. Each example notes the project, source file, and license it was taken from. You may also want to check out the other functions and classes available in the nltk.tokenize module.
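Before working through the examples, here is a minimal, self-contained sketch of calling sent_tokenize() directly. It is an illustrative snippet, not taken from any of the projects below, and it assumes the Punkt sentence tokenizer models have been downloaded with nltk.download('punkt'):

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')  # one-time download of the Punkt sentence tokenizer models

text = "Dr. Smith went to Washington. He arrived on Jan. 5th and stayed two weeks. Was the trip useful?"

# Split the text into sentences, then break each sentence into word tokens,
# mirroring the sent_tokenize()/word_tokenize() pattern used in the examples below.
for sentence in sent_tokenize(text):
    print(sentence)
    print(word_tokenize(sentence))

sent_tokenize() also accepts an optional language argument (e.g. sent_tokenize(text, language='english')), which some of the examples below make use of.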
Example #1
Source File: DataReader.py From MachineLearningSamples-BiomedicalEntityExtraction with MIT License

def get_feature_vectors_1(self, data_list):
    print("Reading unlabeled data from dataframe")

    # list of list of tokens
    all_sentences_words = []

    # Process all lines in the file
    for line in data_list:
        text = line.strip()

        # break the input text into sentences before tokenization
        sentences = sent_tokenize(text)
        for sent in sentences:
            sentence_words = nltk.word_tokenize(sent)
            all_sentences_words.append(tuple(sentence_words))

    self.n_sentences_all = len(all_sentences_words)
    print("number of unlabeled examples = {}".format(self.n_sentences_all))

    return self.create_feature_vectors(all_sentences_words)

##################################################
# create_feature_vectors
##################################################
Example #2
Source File: do_sentence_segmentation.py From training with Apache License 2.0

def process_one_file(one_input):
    """Separate paragraphs into sentences, for one file."""
    input_filename = one_input + args.input_suffix
    output_filename = one_input + args.output_suffix
    logging.info('Processing %s => %s', input_filename, output_filename)

    with io.open(input_filename, 'r', encoding='utf-8') as fin:
        with io.open(output_filename, 'w', encoding='utf-8') as fout:
            for line in fin:
                if len(line) == 1:
                    fout.write(u'\n')
                sents = sent_tokenize(line)
                for sent in sents:
                    sent_str = sent.strip()
                    # if sent_str:
                    fout.write('%s\n' % sent_str)
            fout.write(u'\n')
Example #3
Source File: doc2vec.py From broca with MIT License

def _doc2vec_doc_stream(paths, n, tokenizer=word_tokenize, sentences=True):
    """
    Generator to feed sentences to the doc2vec model.
    """
    i = 0
    p = Progress()
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                i += 1
                p.print_progress(i/n)

                # We do minimal pre-processing here so the model can learn
                # punctuation
                line = line.lower()

                if sentences:
                    for sent in sent_tokenize(line):
                        tokens = tokenizer(sent)
                        yield LabeledSentence(tokens, ['SENT_{}'.format(i)])
                else:
                    tokens = tokenizer(line)
                    yield LabeledSentence(tokens, ['SENT_{}'.format(i)])
Example #4
Source File: batcher.py From docker with MIT License

def fill_example_queue(self, data_path, mode="test"):
    new_queue = []

    filelist = glob.glob(data_path)  # get the list of datafiles
    assert filelist, ('Error: Empty filelist at %s' % data_path)  # check filelist isn't empty
    filelist = sorted(filelist)

    if mode == "train":
        filelist = filelist

    for f in filelist:
        reader = codecs.open(f, 'r', 'utf-8')
        while True:
            string_ = reader.readline()
            if not string_:
                break
            dict_example = json.loads(string_)
            review = dict_example["review"]
            if len(sent_tokenize(review)) < 2:
                continue
            example = Example(review, self._vocab, self._hps)
            new_queue.append(example)
    return new_queue
Example #5
Source File: prepare_clc_fce_data.py From NLP_Toolkit with Apache License 2.0

def main():
    fce = convert_fce(args.fce_dataset_path)
    with open(args.output + "/fce-original.txt", 'w', encoding='utf-8') as out_original, \
            open(args.output + "/fce-applied.txt", 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    if re.search(r"[{>][.?!]$", tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    regexp = r'{([^{}]*?)=>([^{}]*?)}'
                    original = re.sub(regexp, r"\1", tokenized_sents[i])
                    applied = re.sub(regexp, r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n")
Example #6
Source File: corpus_cleaner.py From acl2017-interactive_summarizer with Apache License 2.0

def parse_xml_all(self, data_file, doc_type, language='english'):
    e = ET.parse(data_file)
    cluster_data = {}
    root = e.getroot()
    for topics in root:
        data = []
        topic_id = topics.attrib.get('id')
        for documents in topics.findall(doc_type):
            doc_id = documents.attrib.get('id')
            if doc_type == 'document':
                title_text = documents.find('title').text
            doc_text = documents.find('text').text
            text = text_normalization(doc_text)
            doc_sents = sent_tokenize(text, language)
            data.append([doc_id, doc_sents])
        cluster_data[topic_id] = data
    return cluster_data
Example #7
Source File: summarizer.py From delbot with GNU Affero General Public License v3.0

def summarize(self, text, n):
    """
    Return a list of n sentences which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
Example #8
Source File: kaggle18.py From modin with Apache License 2.0

def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try:
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        text = regex.sub(" ", text)  # remove punctuation
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search("[a-zA-Z]", w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3]
        return filtered_tokens
    except TypeError as e:
        print(text, e)
Example #9
Source File: word2vec.py From Bidirectiona-LSTM-for-text-summarization- with MIT License

def createCorpus(t):
    # st and wt are presumably module-level aliases for sent_tokenize and word_tokenize
    corpus = []
    all_sent = []
    for k in t:
        for p in t[k]:
            corpus.append(st(p))
    for sent in range(len(corpus)):
        for k in corpus[sent]:
            all_sent.append(k)
    for m in range(len(all_sent)):
        all_sent[m] = wt(all_sent[m])

    all_words = []
    for sent in all_sent:
        hold = []
        for word in sent:
            hold.append(word.lower())
        all_words.append(hold)
    return all_words
Example #10
Source File: Word_Frequency_Summarization.py From nlp-akash with MIT License

def run_summarization(text):
    # 1 Create the word frequency table
    freq_table = _create_frequency_table(text)

    '''
    We already have a sentence tokenizer, so we just need
    to run the sent_tokenize() method to create the array of sentences.
    '''

    # 2 Tokenize the sentences
    sentences = sent_tokenize(text)

    # 3 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(sentences, freq_table)

    # 4 Find the threshold
    threshold = _find_average_score(sentence_scores)

    # 5 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)

    return summary
Example #11
Source File: DataReader.py From MachineLearningSamples-BiomedicalEntityExtraction with MIT License

def get_feature_vectors_2(self, data_file):
    print("Loading unlabeled data from file {}".format(data_file))
    with open(data_file, 'r') as f_data:
        all_sentences_words = []

        # Process all lines in the file
        for line in f_data:
            text = line.strip()

            # break the input text into sentences before tokenization
            sentences = sent_tokenize(text)
            for sent in sentences:
                sentence_words = nltk.word_tokenize(sent)
                all_sentences_words.append(tuple(sentence_words))

    self.n_sentences_all = len(all_sentences_words)
    print("number of unlabeled examples = {}".format(self.n_sentences_all))

    return self.create_feature_vectors(all_sentences_words)

##################################################
# get_feature_vectors_1
##################################################
Example #12
Source File: labled_tsv_to_tfrecords_single_sentences.py From bran with Apache License 2.0

def convert_to_single_sentence(doc_str, e1_start, e1_end, e2_start, e2_end, annotation_map):
    offsets = zip(e1_start + e2_start, e1_end + e2_end,
                  [1] * len(e1_start) + [2] * len(e2_start))
    offsets = sorted(offsets, key=lambda tup: tup[0])
    replaced_doc_str = [process_single_annotation(doc_str, 0, s, e, annotation_map, i, ent_id)
                        if i == 0
                        else process_single_annotation(doc_str, offsets[i - 1][1], s, e, annotation_map, i, ent_id)
                        for i, (s, e, ent_id) in enumerate(offsets)]
    replaced_doc_str.append(' '.join(doc_str[offsets[-1][1]:]))
    new_doc_str = ''.join(replaced_doc_str)

    ## TODO only works for data with single e1 and e2 mention
    sentences = sent_tokenize(new_doc_str.replace('@@ ', '').decode('utf-8'))
    tokenized_sents = [tokenize(s) for s in sentences]
    chosen_sent = [i for i, s in enumerate(sentences) if s.count(ENTITY_STRING) >= 2]

    if chosen_sent:
        if FLAGS.full_abstract:
            replaced_sent = [annotation_map[w] if w in annotation_map else w
                             for s in tokenized_sents for w in s]
        else:
            idx = chosen_sent[0]
            s_idx = max(0, idx - FLAGS.sentence_window)
            e_idx = min(idx + FLAGS.sentence_window + 1, len(tokenized_sents))
            window_sentences = [tokenized_sents[i] for i in range(s_idx, e_idx)]
            replaced_sent = [annotation_map[w] if w in annotation_map else w
                             for s in window_sentences for w in s]
        return replaced_sent
Example #13
Source File: extras.py From semeval2017-scienceie with Apache License 2.0

def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        scaped_tok = re.escape(tok)
        m = re.search(scaped_tok, tail)
        start, end = m.span()

        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end

        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
Example #14
Source File: NewsArticleClass.py From Python-Scripts-Repo-on-Data-Science with GNU General Public License v3.0

def summarize(self, article, n):
    # article is a (text, title) tuple
    text = article[0]
    title = article[1]
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sentence in enumerate(word_sent):
        for word in sentence:
            if word in self._freq:
                ranking[i] += self._freq[word]
    sentences_index = nlargest(n, ranking, key=ranking.get)
    return [sentences[j] for j in sentences_index]

##############################################################################
# TEST
Example #15
Source File: util.py From camr with GNU General Public License v2.0

def find_abr_fullname(doc, query, Num):
    """Find the query's (abbreviation's) full name within the document.

    Parameters:
    doc: the document to be searched (specified format)
    query: the abbreviation
    Num: the number of sentences before the query in which to look for the full name
         (here we assume that the full name of the query always appears before the query)
    """
    sents = [word_tokenize(t) for t in sent_tokenize(doc)]
    for i, sent in enumerate(sents):
        if query in sent:
            fullname = find_abr_fn(sent, query)
            if fullname != -1:
                return fullname
            else:
                j = 1
                while i - j >= 0 and j <= Num:
                    if find_abr_fn(sents[i - j], query) == -1:
                        j += 1
                    else:
                        return find_abr_fn(sents[i - j], query)
    raise Exception('No query in the document.')
Example #16
Source File: NewsArticleClass.py From Python-Scripts-Repo-on-Data-Science with GNU General Public License v3.0

def extractFeatures(self, article, n, customStopWords=None):
    # pass in article as a tuple (text, title)
    text = article[0]   # extract the text
    title = article[1]  # extract the title
    sentences = sent_tokenize(text)  # split text into sentences
    word_sent = [word_tokenize(a.lower()) for a in sentences]  # split sentences into words
    self._freq = self._compute_frequencies(word_sent, customStopWords)  # calculate word freq using member func created above
    if n < 0:
        # how many features (words) to return - a -ve number means
        # no feature (word) selection, just return all features
        return nlargest(len(self._freq.keys()), self._freq, key=self._freq.get)
    else:
        # here we say if the calling func has asked for a subset
        # then return only the 'n' largest features, i.e. the
        # most important words (important == frequent, fewer stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
Example #17
Source File: background.py From language with Apache License 2.0

def score_sentences(query, doc_json, entity, sentence_scores, max_sentence_len, n=3):
    """Score sentences with respect to the query."""
    sentences = tokenize.sent_tokenize(doc_json['text'])
    query_ngrams = util.get_ngrams(tokenize.word_tokenize(query), n)
    for sentence in sentences:
        sentence_tokens = tokenize.word_tokenize(sentence)
        tokens = tokenize.word_tokenize(
            entity['wikipedia_name']) + [':'] + sentence_tokens[:max_sentence_len]
        sentence_ngrams = util.get_ngrams(tokens, n)
        score = len(set(sentence_ngrams).intersection(query_ngrams)) / max(
            1, len(query_ngrams))
        sentence_scores.append(((entity, sentence_tokens), score))
Example #18
Source File: create_pretraining_data.py From language with Apache License 2.0

def split_into_sentences(text, doc_annotations, tokenizer):
    """Split into sentences and return bookkeeping info."""
    sentences = []
    sentences_starts = []
    sentence_annotations = []
    doc_annotations = sorted(doc_annotations, key=lambda x: x[2])
    annotation_idx = 0
    sentences_text = tokenize.sent_tokenize(text)
    token_idx = 0
    for sentence_text in sentences_text:
        sub_tokens, word_starts = tokenizer.tokenize(sentence_text)
        sentences.append(sub_tokens)
        sentences_starts.append(word_starts)
        sentence_annotations.append([])
        token_idx += len(sentence_text.split(" "))
        while annotation_idx < len(
                doc_annotations) and doc_annotations[annotation_idx][2] < token_idx:
            sentence_annotations[-1].append(doc_annotations[annotation_idx])
            annotation_idx += 1
    return sentences, sentences_starts, sentence_annotations
Example #19
Source File: preprocess.py From serapis with MIT License

def paragraph_to_sentences(paragraph, term):
    """Turns a paragraph into clean, preprocessed sentences"""
    result = []
    paragraph = re.sub(r"([^ ])([\(\[\"])", r"\1 \2", paragraph)  # Give brackets space to breathe
    paragraph = re.sub(r"([\)\]\"\!\?:])([^ ])", r"\1 \2", paragraph)
    paragraph = re.sub(r"([^. ]{3})\.([^. ]{3}|A |An )", r"\1. \2", paragraph)
    paragraph = re.sub(r" e\.?g\.? ", " _eg_ ", paragraph)  # sent_tokenize improperly splits sentences here
    paragraph = re.sub(r" i\.?e\.? ", " _ie_ ", paragraph)
    sentences = sent_tokenize(paragraph)
    for sentence in sentences:
        sentence = sentence.replace("_eg_", "_e.g._").replace("_ie_", "i.e.")  # reverts edge case
        processed = preprocess_sentence(sentence, term)
        if qualify_sentence(processed):
            result.append(processed)
    return result

# Sentences ########################
Example #20
Source File: text_summarizer.py From nlp_url_summarizer with MIT License

def summarize(self, text, n):
    """
    Return a list of n sentences which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
Example #21
Source File: textsum_data_convert.py From TextSum with MIT License

def _convert_files_to_binary(input_filenames, output_filename):
    with open(output_filename, 'wb') as writer:
        for filename in input_filenames:
            with open(filename, 'r') as f:
                document = f.read()

                document_parts = document.split('\n', 1)
                assert len(document_parts) == 2

                title = '<d><p><s>' + document_parts[0] + '</s></p></d>'

                body = document_parts[1].decode('utf8').replace('\n', ' ').replace('\t', ' ')
                sentences = sent_tokenize(body)
                body = '<d><p>' + ' '.join(['<s>' + sentence + '</s>' for sentence in sentences]) + '</p></d>'
                body = body.encode('utf8')

                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend([body])
                tf_example.features.feature['abstract'].bytes_list.value.extend([title])
                tf_example_str = tf_example.SerializeToString()
                str_len = len(tf_example_str)
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, tf_example_str))
Example #22
Source File: vocabulary.py From topicModelling with GNU General Public License v3.0

def doc_to_ids(self, doc, training=True):
    l = []
    words = dict()
    window = 150
    # doc = doc.replace("–", " ")
    # doc = sent_tokenize(doc)
    for sentence in doc:
        miniArray = []
        for term in sentence:
            id = self.term_to_id(term, training)
            if id != None:
                miniArray.append(id)
                if not id in words:
                    words[id] = 1
                    self.docfreq[id] += 1
        if not len(miniArray):
            continue
        if len(miniArray) > window:
            l.extend([np.array(miniArray[i:i + window]) for i in xrange(0, len(miniArray), window)])
        else:
            l.append(np.array(miniArray))
    return l
Example #23
Source File: Article.py From find-all-the-new-words with MIT License

def out_put_important_sentences(self):
    pp_m_article = re.sub(r"\n", r".\n", self.marked_article)
    sentences = tokenize.sent_tokenize(pp_m_article)
    i_sentences = [_ if self.pattern.search(_) else None for _ in sentences]
    write_important_sentances_to_file("./others/", self.name,
                                      "\n\n".join(list(filter(None, i_sentences))))
Example #24
Source File: main.py From Python-DevOps with MIT License

def tokenizer(string):
    return [word_tokenize(t) for t in sent_tokenize(string)]
Example #25
Source File: vocabulary_sentenceLayer.py From topicModelling with GNU General Public License v3.0

def doc_to_ids(self, doc, training=True):
    l = []
    words = dict()
    doc_sents = sent_tokenize(doc)
    for sentence in doc_sents:
        miniArray = []
        for term in sentence.split():
            id = self.term_to_id(term, training)
            if id != None:
                miniArray.append(id)
                if not id in words:
                    words[id] = 1
                    # Counts in how many documents a word appears. If it appears in
                    # only a few, remove it from the vocabulary using cut_low_freq()
                    self.docfreq[id] += 1
        l.append(np.array(miniArray, dtype=np.int32))
    return l
Example #26
Source File: preprocess.py From BERT-pytorch with The Unlicense

def detect_sentences(raw_documents_path, sentences_detected_path, **_):
    with open(raw_documents_path) as raw_documents_file, \
            open(sentences_detected_path, 'w') as sentences_detected_file:
        for line in tqdm(raw_documents_file):
            sentences = sent_tokenize(line.strip())
            tokenized_sentences = []
            for sentence in sentences:
                sentence = sentence.lower()
                sentence = NUMBERS.sub('N', sentence)
                tokens = [match.group() for match in TOKENIZATION.finditer(sentence)]
                if not tokens:
                    continue
                tokenized_sentences.append(' '.join(tokens))
            output_line = '|'.join(tokenized_sentences) + '\n'
            sentences_detected_file.write(output_line)
Example #27
Source File: raw_books_preproc_pipeline.py From language with Apache License 2.0

def split_line_by_sentences(line):
    return sent_tokenize(line)
Example #28
Source File: books_preproc_pipeline.py From language with Apache License 2.0

def split_line_by_sentences(line):
    return sent_tokenize(line)
Example #29
Source File: summarize.py From Django-Bookworm with MIT License

def __init__(self, text):
    self.__text = text
    self.__stop_words = stopwords.words('english')
    self.__sentence = sent_tokenize(text)
    self.__f_text = self.create_formatted_text()
    self.__word_freq = self.calc_word_frequencies()
Example #30
Source File: phrases.py From broca with MIT License

def _phrase_doc_stream(paths, n, tokenizer=word_tokenize):
    """
    Generator to feed sentences to the phrase model.
    """
    i = 0
    p = Progress()
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                i += 1
                p.print_progress(i/n)
                for sent in sent_tokenize(line.lower()):
                    tokens = tokenizer(sent)
                    yield tokens