Python nltk.FreqDist() Examples
The following are 30 code examples of nltk.FreqDist(), drawn from open-source projects; the source file, project, and license are noted above each example.
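Before the examples, here is a minimal sketch of the core nltk.FreqDist API that the snippets below rely on (the token list is made up for illustration):

import nltk

tokens = "the quick brown fox jumps over the lazy fox".split()
fd = nltk.FreqDist(tokens)

print(fd.most_common(2))   # e.g. [('the', 2), ('fox', 2)], the most frequent tokens
print(fd['fox'])           # 2; FreqDist behaves like a collections.Counter
print(fd.N(), fd.B())      # 9 7; total samples counted, number of distinct token types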
Example #1
Source File: utils.py From BERT with Apache License 2.0 | 8 votes |
def bigram_counts(word_list):
    bgs = nltk.bigrams(word_list)
    fdist = nltk.FreqDist(bgs)
    d = Counter()
    for k, v in fdist.items():
        d[k] = v
    return d
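A quick usage sketch (the sample word list is made up; it assumes the function above is defined in the same module):

import nltk
from collections import Counter

word_list = "to be or not to be".split()
print(bigram_counts(word_list))
# e.g. Counter({('to', 'be'): 2, ('be', 'or'): 1, ('or', 'not'): 1, ('not', 'to'): 1})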
Example #2
Source File: analysis.py From DiSAN with Apache License 2.0 | 7 votes |
def do_analysis(dataset_obj):
    # 1. all sample classification distribution
    # 2. all sentence sample classification distribution
    sample_num = dataset_obj.sample_num
    collect = []
    sent_collect = []
    for trees in dataset_obj.nn_data:
        for sample in trees:
            sentiment_float = sample['root_node']['sentiment_label']
            sentiment_int = cfg.sentiment_float_to_int(sentiment_float)
            if sample['is_sent']:
                sent_collect.append(sentiment_int)
            collect.append(sentiment_int)
    all_pdf = nltk.FreqDist(collect)
    sent_pdf = nltk.FreqDist(sent_collect)
    print('sample_num:', sample_num)
    print('all')
    print(all_pdf.tabulate())
    print('sent')
    print(sent_pdf.tabulate())
Example #3
Source File: similarity.py From ConvNetPy with MIT License | 7 votes |
def test():
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    print 'Query:', documents[-1]

    tokenizer = RegexpTokenizer('\w+')
    vols = []
    for doc in documents:
        samples = []
        for token in tokenizer.tokenize(doc):
            word = token.lower()
            if word not in ENGLISH_STOP_WORDS and word not in punctuation:
                samples.append(word)
        vols.append(volumize(FreqDist(samples)))

    vectors = [ doc_code(v) for v in vols[:-1] ]
    query_vec = doc_code(vols[-1])

    sims = [ cos(v, query_vec) for v in vectors ]
    m = max(sims)
    print m, documents[sims.index(m)]
Example #4
Source File: lexicon.py From CrisisLex with MIT License | 6 votes |
def __init__(self, documents, terms, classes, class_types, frequency, main_class, min_docs):
    self.terms = terms  # the terms used to build the lexicon
    self.documents = documents
    self.classes = classes
    self.terms_frequency = frequency
    self.terms_frequency_per_class = dict()
    self.main_class = main_class
    # the minimum support for a term (i.e., number of documents in the class of interest in order to be considered)
    self.min_docs = min_docs
    self.class_occ = dict()

    for c in class_types:
        self.terms_frequency_per_class[c] = nltk.FreqDist()
        self.class_occ[c] = classes.count(c)

    for i, doc in enumerate(self.documents):
        cls = self.classes[i]
        for t in doc:
            self.terms_frequency_per_class[cls].inc(t)

    # the scoring functions return the list of discriminative terms for the class of interest according to each metric
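Note that FreqDist.inc() was removed in NLTK 3; on current NLTK versions the increment in the inner loop above would be written with item assignment, roughly:

self.terms_frequency_per_class[cls][t] += 1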
Example #5
Source File: reader.py From atap with Apache License 2.0 | 6 votes |
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        for sent in para:
            for word, tag in sent:
                counts['words'] += 1
                tokens[word] += 1

    # Return data structure with information
    return {
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
    }
Example #6
Source File: nlp.py From ReSAN with Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))
    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))
    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:  # note: the original source iterates context_token here as well (question_token may have been intended)
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))
    return {'context': context_tf, 'question': question_tf}
Example #7
Source File: nlp.py From DiSAN with Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))
    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))
    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:  # note: the original source iterates context_token here as well (question_token may have been intended)
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))
    return {'context': context_tf, 'question': question_tf}
Example #8
Source File: similarity.py From ConvNetPy with MIT License | 6 votes |
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))

    return data
Example #9
Source File: lang_model_2.py From jakaton_feminicidios with MIT License | 6 votes |
def __init__(self, order, alpha, sentences):
    self.order = order
    self.alpha = alpha
    if order > 1:
        self.backoff = LangModel(order - 1, alpha, sentences)
        self.lexicon = None
    else:
        self.backoff = None
        self.n = 0
    self.ngramFD = nltk.FreqDist()
    lexicon = set()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        wordNGrams = nltk.ngrams(words, order)
        for wordNGram in wordNGrams:
            self.ngramFD[wordNGram] += 1
            # self.ngramFD.inc(wordNGram)
            if order == 1:
                lexicon.add(wordNGram)
                self.n += 1
    self.v = len(lexicon)
Example #10
Source File: topics.py From ConvNetPy with MIT License | 6 votes |
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))

    return data
Example #11
Source File: nlp.py From BiBloSA with Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))
    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))
    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:  # note: the original source iterates context_token here as well (question_token may have been intended)
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))
    return {'context': context_tf, 'question': question_tf}
Example #12
Source File: analysis.py From BiBloSA with Apache License 2.0 | 6 votes |
def do_analysis(dataset_obj):
    # 1. all sample classification distribution
    # 2. all sentence sample classification distribution
    sample_num = dataset_obj.sample_num
    collect = []
    sent_collect = []
    for trees in dataset_obj.nn_data:
        for sample in trees:
            sentiment_float = sample['root_node']['sentiment_label']
            sentiment_int = cfg.sentiment_float_to_int(sentiment_float)
            if sample['is_sent']:
                sent_collect.append(sentiment_int)
            collect.append(sentiment_int)
    all_pdf = nltk.FreqDist(collect)
    sent_pdf = nltk.FreqDist(sent_collect)
    print('sample_num:', sample_num)
    print('all')
    print(all_pdf.tabulate())
    print('sent')
    print(sent_pdf.tabulate())
Example #13
Source File: dialogue.py From ConvNetPy with MIT License | 6 votes |
def load_data():
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    freqs = [ FreqDist(post.text) for post in posts ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))
    labels = list(set([ post.get('class') for post in posts ]))

    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, labels.index(post.get('class'))))

    return data
Example #14
Source File: nlp.py From BiBloSA with Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))
    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))
    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:  # note: the original source iterates context_token here as well (question_token may have been intended)
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))
    return {'context': context_tf, 'question': question_tf}
Example #15
Source File: tldr.py From SML-Cogs with MIT License | 6 votes |
def _calculate_word_scores(self, phrase_list):
    word_freq = nltk.FreqDist()
    word_degree = nltk.FreqDist()
    for phrase in phrase_list:
        # degree = len(filter(lambda x: not isNumeric(x), phrase)) - 1
        # SML above cost error
        degree = len(list(filter(lambda x: not isNumeric(x), phrase))) - 1
        for word in phrase:
            # word_freq.inc(word)
            # SML error above:
            word_freq[word] += 1
            # word_degree.inc(word, degree)  # other words
            word_degree[word] = degree
    for word in word_freq.keys():
        word_degree[word] = word_degree[word] + word_freq[word]  # itself
    # word score = deg(w) / freq(w)
    word_scores = {}
    for word in word_freq.keys():
        word_scores[word] = word_degree[word] / word_freq[word]
    return word_scores
Example #16
Source File: Trainer.py From truecase with Apache License 2.0 | 5 votes |
def __init__(self):
    self.uni_dist = nltk.FreqDist()
    self.backward_bi_dist = nltk.FreqDist()
    self.forward_bi_dist = nltk.FreqDist()
    self.trigram_dist = nltk.FreqDist()
    self.word_casing_lookup = {}
Example #17
Source File: nlp.py From BiBloSA with Apache License 2.0 | 5 votes |
def dynamic_length(lengthList, ratio, add=None, security=True, fileName=None):
    ratio = float(ratio)
    if add is not None:
        ratio += add
    ratio = ratio if ratio < 1 else 1
    if security:
        ratio = ratio if ratio < 0.99 else 0.99

    def calculate_dynamic_len(pdf, ratio_=ratio):
        cdf = []
        previous = 0
        # accumulate
        for len, freq in pdf:
            previous += freq
            cdf.append((len, previous))
        # calculate
        for len, accu in cdf:
            if 1.0 * accu / previous >= ratio_:  # satisfy the condition
                return len, cdf[-1][0]  # max
        return cdf[-1][0], cdf[-1][0]

    pdf = dict(nltk.FreqDist(lengthList))
    pdf = sorted(pdf.items(), key=lambda d: d[0])
    if fileName is not None:
        with open(fileName, 'w') as f:
            for len, freq in pdf:
                f.write('%d\t%d' % (len, freq))
                f.write(os.linesep)
    return calculate_dynamic_len(pdf, ratio)
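A quick sanity check with a made-up length list (assuming nltk and os are imported at module scope, as the function expects). The call returns the smallest length that covers at least 80% of the samples, plus the maximum length observed:

print(dynamic_length([1, 2, 2, 3, 3, 3, 10], 0.8))   # (3, 10)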
Example #18
Source File: data.py From artificial_neural_networks with Apache License 2.0 | 5 votes |
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([ (w, i) for i, w in enumerate(index2word) ])
    return index2word, word2index, freq_dist
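A quick usage sketch (the sentences and the UNK placeholder are made up for illustration; the original module defines its own UNK constant and imports):

import itertools
import nltk

UNK = '<unk>'  # hypothetical placeholder
sentences = [['hello', 'world'], ['hello', 'there']]
index2word, word2index, freq_dist = index_(sentences, vocab_size=10)
print(index2word)            # e.g. ['_', '<unk>', 'hello', 'world', 'there'] (tie order may vary)
print(word2index['hello'])   # 2
print(freq_dist['hello'])    # 2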
Example #19
Source File: data.py From artificial_neural_networks with Apache License 2.0 | 5 votes |
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([ (w, i) for i, w in enumerate(index2word) ])
    return index2word, word2index, freq_dist
Example #20
Source File: print_english_words.py From adversarial-squad with MIT License | 5 votes |
def main():
    freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
    vocab = [x[0] for x in freq_dist.most_common()[:OPTS.size]]
    for w in vocab:
        print w
Example #21
Source File: nlp.py From BiBloSA with Apache License 2.0 | 5 votes |
def dynamic_length(lengthList, ratio, add=None, security=True, fileName=None):
    ratio = float(ratio)
    if add is not None:
        ratio += add
    ratio = ratio if ratio < 1 else 1
    if security:
        ratio = ratio if ratio < 0.99 else 0.99

    def calculate_dynamic_len(pdf, ratio_=ratio):
        cdf = []
        previous = 0
        # accumulate
        for len, freq in pdf:
            previous += freq
            cdf.append((len, previous))
        # calculate
        for len, accu in cdf:
            if 1.0 * accu / previous >= ratio_:  # satisfy the condition
                return len, cdf[-1][0]  # max
        return cdf[-1][0], cdf[-1][0]

    pdf = dict(nltk.FreqDist(lengthList))
    pdf = sorted(pdf.items(), key=lambda d: d[0])
    if fileName is not None:
        with open(fileName, 'w') as f:
            for len, freq in pdf:
                f.write('%d\t%d' % (len, freq))
                f.write(os.linesep)
    return calculate_dynamic_len(pdf, ratio)
Example #22
Source File: adaptive_collect.py From CrisisLex with MIT License | 5 votes |
def set_adaptive(self, lex, learning_time=3, use_hashtags=True, new_terms_no=10):
    self.adaptive = True
    self.lex_set = set(lex)
    self.terms_no = new_terms_no
    self.use_hashtags = use_hashtags
    self.start_time = datetime.datetime.now()
    self.end_time = self.start_time + datetime.timedelta(hours=learning_time)
    print "Learning interval between %s to %s" % (self.start_time, self.end_time)
    self.terms_fd = nltk.FreqDist()
    self.terms = []
Example #23
Source File: nlp.py From BiBloSA with Apache License 2.0 | 5 votes |
def dynamic_length(lengthList, ratio, add=None, security=True, fileName=None):
    ratio = float(ratio)
    if add is not None:
        ratio += add
    ratio = ratio if ratio < 1 else 1
    if security:
        ratio = ratio if ratio < 0.99 else 0.99

    def calculate_dynamic_len(pdf, ratio_=ratio):
        cdf = []
        previous = 0
        # accumulate
        for len, freq in pdf:
            previous += freq
            cdf.append((len, previous))
        # calculate
        for len, accu in cdf:
            if 1.0 * accu / previous >= ratio_:  # satisfy the condition
                return len, cdf[-1][0]  # max
        return cdf[-1][0], cdf[-1][0]

    pdf = dict(nltk.FreqDist(lengthList))
    pdf = sorted(pdf.items(), key=lambda d: d[0])
    if fileName is not None:
        with open(fileName, 'w') as f:
            for len, freq in pdf:
                f.write('%d\t%d' % (len, freq))
                f.write(os.linesep)
    return calculate_dynamic_len(pdf, ratio)
Example #24
Source File: ner.py From metadoc with MIT License | 5 votes |
def _calculate_word_scores(self, word_list):
    """Quick and dirty, inspired by Sujit Pal's RAKE implementation.
    """
    word_freq = nltk.FreqDist()
    for word in word_list:
        word_freq[word] += 1
    word_scores = {k: v for k, v in word_freq.items() if v > 0}
    return word_scores

# def _get_mt_median(self, word_scores):
#     median = numpy.median([v for k, v in word_scores.items()])
#     return {k: v for k, v in word_scores.items() if v > median}
Example #25
Source File: vocab.py From quick-nlp with MIT License | 5 votes |
def __init__(self, tokens: List[Tokens], special_symbols: List[str] = None):
    special_symbols = [] if special_symbols is None else special_symbols
    special_symbols = special_symbols + ["<eot>", "<response>", "<eos>", "<unk>", "<pad>", "<bos>"]
    self.vocab = FreqDist()
    self.cdf = 0.
    for sample in tokens:
        for token in sample:
            if token not in special_symbols:
                self.vocab[token] += 1
    print(f"total samples in vocab: {self.vocab.N()}, total tokens in vocab: {self.vocab.B()}")
    self.itos = []
    self.stoi = {}
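For reference, FreqDist.N() is the total number of samples counted and FreqDist.B() is the number of distinct bins (token types), so the print above reports the total token count and the vocabulary size. A minimal illustration (assuming FreqDist is imported as in the example):

from nltk import FreqDist

fd = FreqDist(['a', 'b', 'a'])
print(fd.N(), fd.B())   # 3 2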
Example #26
Source File: nlp.py From DiSAN with Apache License 2.0 | 5 votes |
def dynamic_keep(collect, ratio, fileName=None):
    pdf = dict(nltk.FreqDist(collect))
    pdf = sorted(pdf.items(), key=lambda d: d[1], reverse=True)

    cdf = []
    previous = 0
    # accumulate
    for token, freq in pdf:
        previous += freq
        cdf.append((token, previous))
    # calculate
    for idx, (token, accu) in enumerate(cdf):
        keepAnchor = idx
        if 1.0 * accu / previous >= ratio:  # satisfy the condition
            break

    tokenList = []
    for idx, (token, freq) in enumerate(pdf):
        if idx > keepAnchor:
            break
        tokenList.append(token)

    if fileName is not None:
        with open(fileName, 'w') as f:
            for idx, (token, freq) in enumerate(pdf):
                f.write('%d\t%d' % (token, freq))
                f.write(os.linesep)
                if idx == keepAnchor:
                    print(os.linesep * 20)

    return tokenList
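A quick sanity check with a made-up token list (assuming nltk and os are imported at module scope, as the function expects). The call keeps the most frequent tokens until they cover at least 80% of all occurrences:

print(dynamic_keep(['a', 'a', 'a', 'b', 'b', 'c'], 0.8))   # ['a', 'b']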
Example #27
Source File: nlp.py From DiSAN with Apache License 2.0 | 5 votes |
def dynamic_length(lengthList, ratio, add=None, security=True, fileName=None):
    ratio = float(ratio)
    if add is not None:
        ratio += add
    ratio = ratio if ratio < 1 else 1
    if security:
        ratio = ratio if ratio < 0.99 else 0.99

    def calculate_dynamic_len(pdf, ratio_=ratio):
        cdf = []
        previous = 0
        # accumulate
        for len, freq in pdf:
            previous += freq
            cdf.append((len, previous))
        # calculate
        for len, accu in cdf:
            if 1.0 * accu / previous >= ratio_:  # satisfy the condition
                return len, cdf[-1][0]  # max
        return cdf[-1][0], cdf[-1][0]

    pdf = dict(nltk.FreqDist(lengthList))
    pdf = sorted(pdf.items(), key=lambda d: d[0])
    if fileName is not None:
        with open(fileName, 'w') as f:
            for len, freq in pdf:
                f.write('%d\t%d' % (len, freq))
                f.write(os.linesep)
    return calculate_dynamic_len(pdf, ratio)
Example #28
Source File: nlp.py From DiSAN with Apache License 2.0 | 5 votes |
def dynamic_keep(collect, ratio, fileName=None):
    pdf = dict(nltk.FreqDist(collect))
    pdf = sorted(pdf.items(), key=lambda d: d[1], reverse=True)

    cdf = []
    previous = 0
    # accumulate
    for token, freq in pdf:
        previous += freq
        cdf.append((token, previous))
    # calculate
    for idx, (token, accu) in enumerate(cdf):
        keepAnchor = idx
        if 1.0 * accu / previous >= ratio:  # satisfy the condition
            break

    tokenList = []
    for idx, (token, freq) in enumerate(pdf):
        if idx > keepAnchor:
            break
        tokenList.append(token)

    if fileName is not None:
        with open(fileName, 'w') as f:
            for idx, (token, freq) in enumerate(pdf):
                f.write('%d\t%d' % (token, freq))
                f.write(os.linesep)
                if idx == keepAnchor:
                    print(os.linesep * 20)

    return tokenList
Example #29
Source File: nlp.py From DiSAN with Apache License 2.0 | 5 votes |
def dynamic_length(lengthList, ratio, add=None, security=True, fileName=None):
    ratio = float(ratio)
    if add is not None:
        ratio += add
    ratio = ratio if ratio < 1 else 1
    if security:
        ratio = ratio if ratio < 0.99 else 0.99

    def calculate_dynamic_len(pdf, ratio_=ratio):
        cdf = []
        previous = 0
        # accumulate
        for len, freq in pdf:
            previous += freq
            cdf.append((len, previous))
        # calculate
        for len, accu in cdf:
            if 1.0 * accu / previous >= ratio_:  # satisfy the condition
                return len, cdf[-1][0]  # max
        return cdf[-1][0], cdf[-1][0]

    pdf = dict(nltk.FreqDist(lengthList))
    pdf = sorted(pdf.items(), key=lambda d: d[0])
    if fileName is not None:
        with open(fileName, 'w') as f:
            for len, freq in pdf:
                f.write('%d\t%d' % (len, freq))
                f.write(os.linesep)
    return calculate_dynamic_len(pdf, ratio)
Example #30
Source File: topics.py From ConvNetPy with MIT License | 5 votes |
def test():
    global N, words, network

    print 'In testing.'

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation,
    conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged
    in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long
    endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field,
    as a final resting place for those who here gave their lives that that nation might live. It is altogether
    fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not
    consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have
    consecrated it, far above our poor power to add or detract. The world will little note, nor long remember
    what we say here, but it can never forget what they did here. It is for us the living, rather, to be
    dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is
    rather for us to be here dedicated to the great task remaining before us -- that from these honored dead
    we take increased devotion to that cause for which they gave the last full measure of devotion -- that we
    here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have
    a new birth of freedom -- and that government of the people, by the people, for the people, shall not
    perish from the earth."""
    tokenizer = RegexpTokenizer('\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg)

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = network.forward(V).w

    topics = []
    while len(topics) != 5:
        max_act = max(pred)
        topic_idx = pred.index(max_act)
        topic = words[topic_idx]
        if topic in gettysburg_tokens:
            topics.append(topic)
        del pred[topic_idx]

    print 'Topics of the Gettysburg Address:'
    print topics