Python nltk.sent_tokenize() Examples
The following are 30 code examples of nltk.sent_tokenize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the link above each example. You may also want to check out all available functions and classes of the nltk module.
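Before the project examples, here is a minimal usage sketch of nltk.sent_tokenize() itself: it relies on NLTK's pre-trained Punkt model, which has to be downloaded once (the sample text below is made up for illustration).

import nltk

# One-time download of the pre-trained Punkt sentence tokenizer models.
nltk.download('punkt')

text = "NLTK makes sentence splitting easy. It ships a pre-trained Punkt model. Download it once and reuse it."

# Split the text into a list of sentence strings and print one per line.
for sentence in nltk.sent_tokenize(text):
    print(sentence)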
Example #1
Source File: kaggle.py From dl-models-for-qa with Apache License 2.0 | 8 votes |
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
Example #2
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 6 votes |
def flesch_kincaid_reading_ease(text, token_count):
    """
    Takes a text and returns its FK Reading Ease
    :param text: A string text
    :param token_count: the number of tokens in the text
    :return: FK Reading Ease
    """
    # Partly extracted from textstat 0.3.1 which is only available for python 2 (https://github.com/shivam5992/textstat)
    def avg_syllables_per_word(text, token_count):
        syllable = syllable_count(text)
        if token_count > 0:
            return float(syllable) / float(token_count)
        else:
            return 0

    if len(nltk.sent_tokenize(text)) <= 0 or token_count <= 0:
        return 0
    ASL = float(token_count / len(nltk.sent_tokenize(text)))  # avg sentence length
    ASW = avg_syllables_per_word(text, token_count)
    FKRA = 206.835 - float(1.015 * ASL) - float(84.6 * ASW)
    return FKRA
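A possible way to call the function above; note that it expects the caller to pass the token count separately, and that the syllable_count() helper it uses is defined elsewhere in the project. The word-level tokenization below is an assumption for illustration, not part of the original file.

import nltk

# Hypothetical call; assumes flesch_kincaid_reading_ease and its
# syllable_count helper are importable from the project.
text = "This is a short example. It has two simple sentences."
token_count = len(nltk.word_tokenize(text))  # one reasonable way to obtain the token count
print(flesch_kincaid_reading_ease(text, token_count))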
Example #3
Source File: extract_data.py From LSTM-CRF-models with MIT License | 6 votes |
def prepareSents(wrds):
    valid_sents = []
    text = ''.join(wrd[0] for wrd in wrds)
    sent_list = [[(word, 0, 'None') for word in sent] for sent in sent_tokenize(text)]
    text = [word for word in wrds if word[0] != ' ']
    sent_list = [[word for word in concat_words(strip_chars(sent)) if word[0] != ' '] for sent in sent_list]
    idx = 0
    s_idx = 0
    while idx < len(text) and s_idx < len(sent_list):
        if not match_words(sent_list[s_idx], text[idx:idx + len(sent_list[s_idx])]):
            print("NLTK:" + str(sent_list[s_idx]))
            print('MINE:' + str(text[idx:idx + len(sent_list[s_idx])]))
        else:
            valid_sents += [text[idx:idx + len(sent_list[s_idx])]]
        idx = idx + len(sent_list[s_idx])
        s_idx += 1
    return valid_sents
Example #4
Source File: nlp.py From partisan-discourse with Apache License 2.0 | 6 votes |
def preprocess(html):
    """
    Returns a preprocessed document consisting of a list of paragraphs, which
    is a list of sentences, which is a list of tuples, where each tuple is a
    (token, part of speech) pair.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e)))
Example #5
Source File: gender.py From atap with Apache License 2.0 | 6 votes |
def parse_gender(text):
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
Example #6
Source File: custom.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 6 votes |
def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []

    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        # Get POS
        phrase = []

        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []

    return phrase_list
Example #7
Source File: reddit_utils.py From neural_chat with MIT License | 6 votes |
def clean_thread_conversations(sub_str):
    conversations = []
    for mon in ['07', '08', '09', '10', '11', '12']:
        with open('datasets/raw_reddit/reddit_{}_{}_18threads.json'.format(sub_str, mon)) as f:
            data = json.load(f)
        for thread in data:
            new_convo = {}
            new_convo['lines'] = []
            speaker = 0
            for msg in thread:
                text = clean_post(msg['text'])
                if len(text) > 1:
                    sentences = nltk.sent_tokenize(text)
                    for sent in sentences:
                        sent_dict = {}
                        sent_dict['character'] = speaker
                        sent_dict['text'] = sent
                        new_convo['lines'].append(sent_dict)
                    speaker = 1 - speaker
            if len(new_convo['lines']) > 1:
                conversations.append(new_convo)
    return conversations
Example #8
Source File: psykey.py From tika-similarity with Apache License 2.0 | 6 votes |
def __init__(self, text, wordlistfolder):
    self.text = text
    self.tokens = nltk.word_tokenize(text)
    self.sentenses = nltk.sent_tokenize(text)
    self.tags = nltk.pos_tag(self.tokens)

    self.featspace = []

    self.psykfeatspace(self.featspace, wordlistfolder)
    self.bigrams(self.featspace)
    self.number_count(self.featspace)
    self.punc_count(self.featspace)
    self.big_word_count(self.featspace)
    self.words_per_sentence(self.featspace)
    self.sentence_count(self.featspace)
    self.countPOS(self.featspace, 'CC')
    self.countPOS(self.featspace, 'NP')
    self.countPOS(self.featspace, 'NNP')
    self.words(self.featspace)
    self.stem(self.featspace)

# Counts a specific POS tags
Example #9
Source File: text.py From pliers with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _from_text(self, text, unit, tokenizer, language):
    if tokenizer is not None:
        if isinstance(tokenizer, str):
            tokens = re.findall(tokenizer, text)
        else:
            tokens = tokenizer.tokenize(text)
    else:
        import nltk

        @requires_nltk_corpus
        def tokenize_text(text):
            if unit == 'word':
                return nltk.word_tokenize(text, language)
            elif unit.startswith('sent'):
                return nltk.sent_tokenize(text, language)
            else:
                raise ValueError(
                    "unit must be either 'word' or 'sentence'")

        tokens = tokenize_text(text)

    for i, t in enumerate(tokens):
        self._elements.append(TextStim(text=t, onset=None, duration=None, order=i))
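As the snippet above shows, nltk.sent_tokenize() takes an optional language argument that selects the Punkt model to use. A small sketch with a non-English language (the German text is made up for illustration):

import nltk

# Requires the Punkt models: nltk.download('punkt')
german_text = "Das ist der erste Satz. Hier kommt der zweite Satz."
print(nltk.sent_tokenize(german_text, language='german'))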
Example #10
Source File: sentence.py From Valx with GNU General Public License v3.0 | 6 votes |
def sentence_splitting(texts, slen=1):
    if len(texts) <= 0:
        return []

    # splitting
    sentences = []
    text_sents = sent_tokenize(texts)
    if (text_sents != [''] and len(text_sents) > 0):
        for sent in text_sents:
            sent = sent.strip().split('\r')  # split strings that contain "\r"
            for sen in sent:
                se = sen.split('. ')
                for s in se:
                    if (NLP_word.words_counting(s) >= slen):
                        sentences.append(s)
    return sentences

# splitting text into Sentences using NLTK tokenization
Example #11
Source File: nltk_plugin.py From self-attentive-parser with MIT License | 6 votes |
def parse_sents(self, sents):
    """
    Parse multiple sentences

    If "sents" is a string, it will be segmented into sentences using NLTK.
    Otherwise, each element of "sents" will be treated as a sentence.

    sents (str or Iterable[str] or Iterable[List[str]]): sentences to parse

    Returns: Iter[nltk.Tree]
    """
    if isinstance(sents, STRING_TYPES):
        if self._tokenizer_lang is None:
            raise ValueError(
                "No tokenizer available for this language. "
                "Please split into individual sentences and tokens "
                "before calling the parser."
            )
        sents = nltk.sent_tokenize(sents, self._tokenizer_lang)

    for parse_raw, tags_raw, sentence in self._batched_parsed_raw(self._nltk_process_sents(sents)):
        yield self._make_nltk_tree(sentence, tags_raw, *parse_raw)
Example #12
Source File: preprocessor.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileid):
    """
    Segments, tokenizes, and tags a document in the corpus. Returns a
    generator of paragraphs, which are lists of sentences, which in turn are
    lists of part of speech tagged words.
    """
    for paragraph in self.corpus.paras(fileids=fileid):
        yield [
            nltk.pos_tag(nltk.wordpunct_tokenize(sent))
            for sent in nltk.sent_tokenize(paragraph)
        ]
Example #13
Source File: harvesttext.py From HarvestText with MIT License | 5 votes |
def cut_sentences(self, para, drop_empty_line=True, strip=True, deduplicate=False):
    '''cut_sentences

    :param para: input text
    :param drop_empty_line: whether to drop empty lines
    :param strip: whether to strip() each sentence
    :param deduplicate: whether to collapse runs of repeated punctuation, which helps split sentences that end with consecutive punctuation marks
    :return: sentences: list of str
    '''
    if deduplicate:
        para = re.sub(r"([。!?\!\?])\1+", r"\1", para)

    if self.language == 'en':
        from nltk import sent_tokenize
        sents = sent_tokenize(para)
        if strip:
            sents = [x.strip() for x in sents]
        if drop_empty_line:
            sents = [x for x in sents if len(x.strip()) > 0]
        return sents
    else:
        para = re.sub('([。!?\?!])([^”’])', r"\1\n\2", para)  # single-character sentence terminators
        para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
        para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
        para = re.sub('([。!?\?!][”’])([^,。!?\?])', r'\1\n\2', para)  # if a closing quote follows a terminator, the quote is the end of the sentence, so the split marker \n goes after the quote; note that the rules above carefully preserve quote characters
        para = para.rstrip()  # drop any extra \n at the end of the paragraph
        # Many rule sets also handle the semicolon ;, but it is ignored here, as are dashes and English double quotes; simple adjustments can be made if needed.
        sentences = para.split("\n")
        if strip:
            sentences = [sent.strip() for sent in sentences]
        if drop_empty_line:
            sentences = [sent for sent in sentences if len(sent.strip()) > 0]
        return sentences
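For reference, the Chinese branch above can be exercised on its own. Below is a minimal standalone adaptation, reduced to the single-character terminator rule; it is a sketch for illustration, not part of HarvestText.

import re

def cut_chinese_sentences(para):
    # Insert a newline after each sentence-ending punctuation mark that is not
    # followed by a closing quote, then split on the newlines
    # (single-character terminator rule from the method above).
    para = re.sub('([。!?\?!])([^”’])', r"\1\n\2", para)
    return [sent.strip() for sent in para.rstrip().split("\n") if sent.strip()]

print(cut_chinese_sentences("今天天气很好!我们去公园吧。你觉得怎么样?"))
# ['今天天气很好!', '我们去公园吧。', '你觉得怎么样?']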
Example #14
Source File: find_full_text_sentence.py From indra with BSD 2-Clause "Simplified" License | 5 votes |
def sentence_tokenize(self, text):
    # return text.split('.')
    return sent_tokenize(text)
Example #15
Source File: tokenize.py From pywsd with MIT License | 5 votes |
def word_tokenize(text, language='english', preserve_line=False):
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [token for sent in sentences
            for token in _treebank_word_tokenizer.tokenize(sent)]
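The _treebank_word_tokenizer referenced above is a module-level tokenizer instance defined elsewhere in pywsd. A self-contained sketch of the same sentence-first, then word-level tokenization pattern, using NLTK's own TreebankWordTokenizer:

from nltk import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer

_treebank_word_tokenizer = TreebankWordTokenizer()

text = "Sentence one is here. Sentence two follows it."
# Split into sentences first, then tokenize each sentence into words.
tokens = [token
          for sent in sent_tokenize(text, 'english')
          for token in _treebank_word_tokenizer.tokenize(sent)]
print(tokens)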
Example #16
Source File: zoo.py From razdel with MIT License | 5 votes |
def nltk_sentenize(text):
    from nltk import sent_tokenize

    chunks = sent_tokenize(text, 'russian')
    return find_substrings(chunks, text)
Example #17
Source File: analyzer.py From homer with MIT License | 5 votes |
def __init__(self, paragraph):
    paragraph = paragraph.replace('—', ' ')
    self.paragraph = paragraph
    self.tokenized_sentences = nltk.sent_tokenize(paragraph)
    self._sentences = [Sentence(sentence) for sentence in self.tokenized_sentences]
Example #18
Source File: SensationalismClassifier.py From news-audit with GNU General Public License v3.0 | 5 votes |
def transform(self, text_fields):
    stats = []
    punctuation = string.punctuation
    abvs = ['CNN', 'FBI', 'ABC', 'MSNBC', 'GOP', 'U.S.', 'US', 'ISIS', 'DNC', 'TV', 'CIA', 'I', 'AP', 'PM', 'AM',
            'EU', 'USA', 'UK', 'UN', 'CEO', 'NASA', 'LGBT', 'LGBTQ', 'NAFTA', 'ACLU']
    for field in text_fields:
        field_stats = {}
        tok_text = nltk.word_tokenize(field)
        try:
            num_upper = float(len([w for w in tok_text if w.isupper() and w not in abvs])) / len(tok_text)
        except:
            num_upper = 0
        try:
            num_punct = float(len([ch for ch in field if ch in punctuation])) / len(field)
        except:
            num_punct = 0
        try:
            sent_lengths = [len(nltk.word_tokenize(s)) for s in nltk.sent_tokenize(field)]
            av_sent_len = float(sum(sent_lengths)) / len(sent_lengths)
        except:
            av_sent_len = 0
        try:
            num_prof = float(len([w for w in tok_text if w.lower() in PROFANITY])) / len(tok_text)
        except:
            num_prof = 0

        polarity, subjectivity = sentiment(field)

        field_stats['all_caps'] = num_upper
        field_stats['sent_len'] = av_sent_len
        field_stats['polarity'] = polarity
        field_stats['subjectivity'] = subjectivity
        field_stats['profanity'] = num_prof
        stats.append(field_stats)

    return stats
Example #19
Source File: sent_parsing.py From atap with Apache License 2.0 | 5 votes |
def sents(paragraph):
    for sentence in sent_tokenize(paragraph):
        yield sentence
Example #20
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def coleman_liau_index(text, token_count):
    """
    Takes a text and returns its Coleman Liau Index
    :param text: A string text
    :return: Coleman Liau Index
    """
    # Partly extracted from textstat 0.3.1 which is only available for python 2 (https://github.com/shivam5992/textstat)
    def char_count(text):
        """
        Function to return total character counts in a text
        """
        count_chars = 0
        text = text.replace(" ", "")
        for char in text:
            if char not in string.punctuation:
                count_chars += 1
        return count_chars

    def avg_letters_per_word(text):
        ALPW = float(float(char_count(text)) / token_count)
        return ALPW

    def avg_sentence_per_word(text):
        ASPW = float(len(nltk.sent_tokenize(text)) / float(token_count))
        return ASPW

    if token_count <= 0:
        return 0
    L = avg_letters_per_word(text) * 100   # avg letters per 100 words
    S = avg_sentence_per_word(text) * 100  # avg sentences per 100 words
    CLI = float((0.0588 * L) - (0.296 * S) - 15.8)
    return CLI
Example #21
Source File: preprocess.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileid):
    """
    Segments, tokenizes, and tags a document in the corpus. Returns a
    generator of paragraphs, which are lists of sentences, which in turn are
    lists of part of speech tagged words.
    """
    for paragraph in self.corpus.paras(fileids=fileid):
        yield [
            nltk.pos_tag(nltk.wordpunct_tokenize(sent))
            for sent in nltk.sent_tokenize(paragraph)
        ]
Example #22
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def sents(self, fileids=None, categories=None):
    """
    Uses the built in sentence tokenizer to extract sentences from the
    paragraphs. Note that this method uses BeautifulSoup to parse HTML.
    """
    for paragraph in self.paras(fileids, categories):
        for sentence in sent_tokenize(paragraph):
            yield sentence
Example #23
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def automated_readability_index(text, token_count):
    """
    Takes a text and returns its Automated Readability Index
    :param text: A string text without punctuation
    :return: Automated Readability Index
    """
    # Partly extracted from textstat 0.3.1 which is only available for python 2 (https://github.com/shivam5992/textstat)
    def char_count(text):
        """
        Function to return total character counts in a text
        """
        count_chars = 0
        text = text.replace(" ", "")
        for char in text:
            if char not in string.punctuation:
                count_chars += 1
        return count_chars

    chrs = char_count(text)
    wrds = token_count
    snts = len(nltk.sent_tokenize(text))
    if wrds == 0 or snts == 0:
        return 0
    a = (float(chrs) / float(wrds))
    b = (float(wrds) / float(snts))
    ARI = (4.71 * a) + (0.5 * b) - 21.43
    return ARI
Example #24
Source File: oz.py From atap with Apache License 2.0 | 5 votes |
def matrix(text, cast):
    mtx = []
    for first in cast:
        row = []
        for second in cast:
            count = 0
            for title, chapter in text['chapters'].items():
                for sent in sent_tokenize(chapter):
                    if first in sent and second in sent:
                        count += 1
            row.append(count)
        mtx.append(row)
    return mtx
Example #25
Source File: oz.py From atap with Apache License 2.0 | 5 votes |
def cooccurrence(text, cast):
    possible_pairs = list(itertools.combinations(cast, 2))
    cooccurring = dict.fromkeys(possible_pairs, 0)
    for title, chapter in text['chapters'].items():
        for sent in sent_tokenize(chapter):
            for pair in possible_pairs:
                if pair[0] in sent and pair[1] in sent:
                    cooccurring[pair] += 1
    return cooccurring
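Both oz.py functions expect text to be a dict with a 'chapters' mapping from chapter titles to chapter text, and cast to be a list of character names. A hypothetical call to cooccurrence() (the chapter text and names below are made up, and the function above is assumed to be in scope together with its itertools and sent_tokenize imports):

text = {
    'chapters': {
        'Chapter 1': "Dorothy lived in Kansas. Dorothy met the Scarecrow on the road.",
        'Chapter 2': "The Scarecrow and the Lion followed Dorothy.",
    }
}
cast = ['Dorothy', 'Scarecrow', 'Lion']

print(cooccurrence(text, cast))
# {('Dorothy', 'Scarecrow'): 2, ('Dorothy', 'Lion'): 1, ('Scarecrow', 'Lion'): 1}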
Example #26
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def lix_index(text, token_count):
    """
    A readability measure developed by Carl-Hugo Björnsson
    Formula adapted from: https://en.wikipedia.org/wiki/LIX
    :param text: A string text without punctuation
    :return: LIX Index
    """
    def get_long_word_count(text):
        """ Returns the number of words with more than 6 letters """
        long_word_count = 0
        for word in nltk.word_tokenize(text):
            if len(word) > 6:
                long_word_count += 1
        return long_word_count

    A = token_count  # number of words
    B = 0            # number of sentences (also split at ':')
    for sent in nltk.sent_tokenize(text):
        B += len(re.split(':', sent))
    C = get_long_word_count(text)  # number of words with more than 6 letters

    if B > 0 and A > 0:
        LIX = float(A / B) + float((C * 100) / A)
        return LIX
    else:
        return 0
Example #27
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def rix_index(text):
    """
    A readability measure developed by Anderson, simplifies the LIX index
    Anderson, Jonathan. "Analysing the Readability of English and Non-English Texts in the Classroom with Lix"
    source: http://www.jstor.org/stable/40031755?seq=1#page_scan_tab_contents
    :param text: A string text without punctuation
    :return: RIX Index
    """
    def get_long_word_count(text):
        """ Returns the number of words with more than 6 letters """
        long_word_count = 0
        for word in nltk.word_tokenize(text):
            if len(word) > 6:
                long_word_count += 1
        return long_word_count

    sent_count = 0  # number of sentences (also split at ':' and ';')
    for sent in nltk.sent_tokenize(text):
        sent_count += len(re.split('[:;]', sent))
    long_word_count = get_long_word_count(text)  # number of words with more than 6 letters

    if sent_count > 0:
        return float(long_word_count / sent_count)
    else:
        return 0
Example #28
Source File: readability_indices.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def mcalpine_eflaw_index(text):
    """
    A readability score defined by Rachel McAlpine
    See https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/
    EFLAW index = (#tokens + #miniwords) / #sentences
    :param text: A string text without punctuation
    :return: McAlpine EFLAW Index
    """
    tokenized_sents = nltk.sent_tokenize(text)
    sentence_count = len(tokenized_sents)
    token_count = 0
    miniword_count = 0  # words with 1, 2 or 3 letters
    for sent in tokenized_sents:
        for token in nltk.word_tokenize(sent):
            if token not in string.punctuation:
                token_count += 1
                if len(token) <= 3:
                    miniword_count += 1
    if sentence_count >= 1:
        return float((token_count + miniword_count) / sentence_count)
    else:
        return 0
Example #29
Source File: ngrams.py From atap with Apache License 2.0 | 5 votes |
def ngrams2(text, n=2):
    for sent in sent_tokenize(text):
        sent = word_tokenize(sent)
        for ngram in nltk_ngrams(sent, n):
            yield ngram
Example #30
Source File: readability.py From serapis with MIT License | 5 votes |
def __init__(self, doc):
    """
    Args:
        doc: str
    """
    self.doc = unidecode(doc)
    self.sentence_count = len(sent_tokenize(doc))
    words = word_tokenize(doc)
    syllables = [self._count_syllables(word) for word in words]
    self.char_count = sum(len(word) for word in words)
    self.syllable_count = sum(syllables)
    self._invalid = not self.sentence_count or not self.char_count
    self.complex_word_count = len([s for s in syllables if s >= 4])
    self.word_count = len(words)
    self.words_per_sentence = 1.0 * self.word_count / self.sentence_count if not self._invalid else 0