Python nltk.FreqDist() Examples
The following are 30 code examples of nltk.FreqDist(), drawn from open-source projects; the source file, project, and license are noted above each example.
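Before the examples, here is a minimal sketch of the core nltk.FreqDist API that the snippets below rely on (the token list is made up for illustration):

import nltk

tokens = "the quick brown fox jumps over the lazy fox".split()
fd = nltk.FreqDist(tokens)

print(fd.most_common(2))   # e.g. [('the', 2), ('fox', 2)], the most frequent tokens
print(fd['fox'])           # 2; FreqDist behaves like a collections.Counter
print(fd.N(), fd.B())      # 9 7; total samples counted, number of distinct token types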
Example #1
Source File: utils.py From BERT with Apache License 2.0 | 8 votes |
def bigram_counts(word_list):
    bgs = nltk.bigrams(word_list)
    fdist = nltk.FreqDist(bgs)
    d = Counter()
    for k, v in fdist.items():
        d[k] = v
    return d
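A quick usage sketch (the sample word list is made up; it assumes the function above is defined in the same module):

import nltk
from collections import Counter

word_list = "to be or not to be".split()
print(bigram_counts(word_list))
# e.g. Counter({('to', 'be'): 2, ('be', 'or'): 1, ('or', 'not'): 1, ('not', 'to'): 1})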
Example #2
Source File: analysis.py From DiSAN with Apache License 2.0 | 7 votes |
def do_analysis(dataset_obj):
    # 1. all sample classification distribution
    # 2. all sentence sample classification distribution
    sample_num = dataset_obj.sample_num
    collect = []
    sent_collect = []
    for trees in dataset_obj.nn_data:
        for sample in trees:
            sentiment_float = sample['root_node']['sentiment_label']
            sentiment_int = cfg.sentiment_float_to_int(sentiment_float)
            if sample['is_sent']:
                sent_collect.append(sentiment_int)
            collect.append(sentiment_int)
    all_pdf = nltk.FreqDist(collect)
    sent_pdf = nltk.FreqDist(sent_collect)
    print('sample_num:', sample_num)
    print('all')
    print(all_pdf.tabulate())
    print('sent')
    print(sent_pdf.tabulate())
Example #3
Source File: similarity.py From ConvNetPy with MIT License | 7 votes |
def test():
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    print 'Query:', documents[-1]

    tokenizer = RegexpTokenizer('\w+')
    vols = []
    for doc in documents:
        samples = []
        for token in tokenizer.tokenize(doc):
            word = token.lower()
            if word not in ENGLISH_STOP_WORDS and word not in punctuation:
                samples.append(word)
        vols.append(volumize(FreqDist(samples)))

    vectors = [ doc_code(v) for v in vols[:-1] ]
    query_vec = doc_code(vols[-1])

    sims = [ cos(v, query_vec) for v in vectors ]
    m = max(sims)
    print m, documents[sims.index(m)]
Example #4
Source File: lexicon.py From CrisisLex with MIT License | 6 votes |
def __init__(self, documents, terms, classes, class_types, frequency, main_class, min_docs):
    self.terms = terms  # the terms used to build the lexicon
    self.documents = documents
    self.classes = classes
    self.terms_frequency = frequency
    self.terms_frequency_per_class = dict()
    self.main_class = main_class
    # the minimum support for a term (i.e., number of documents in the class of interest in order to be considered)
    self.min_docs = min_docs
    self.class_occ = dict()

    for c in class_types:
        self.terms_frequency_per_class[c] = nltk.FreqDist()
        self.class_occ[c] = classes.count(c)

    for i, doc in enumerate(self.documents):
        cls = self.classes[i]
        for t in doc:
            self.terms_frequency_per_class[cls].inc(t)

    # the scoring functions return the list of discriminative terms for the class of interest according to each metric
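Note that FreqDist.inc() was removed in NLTK 3; on current NLTK versions the increment in the inner loop above would be written with item assignment, roughly:

self.terms_frequency_per_class[cls][t] += 1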
Example #5
Source File: reader.py From atap with Apache License 2.0 | 6 votes |
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        for sent in para:
            for word, tag in sent:
                counts['words'] += 1
                tokens[word] += 1

    # Return data structure with information
    return {
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
    }
Example #6
Source File: nlp.py From ReSAN with Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))
    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))
    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:  # note: the original source iterates context_token here as well (question_token may have been intended)
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))
    return {'context': context_tf, 'question': question_tf}
Example #7
Source File: nlp.py From DiSAN with Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))
    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))
    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:  # note: the original source iterates context_token here as well (question_token may have been intended)
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))
    return {'context': context_tf, 'question': question_tf}
Example #8
Source File: similarity.py From ConvNetPy with MIT License | 6 votes |
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))

    return data
Example #9
Source File: lang_model_2.py From jakaton_feminicidios with MIT License | 6 votes |
def __init__(self, order, alpha, sentences):
    self.order = order
    self.alpha = alpha
    if order > 1:
        self.backoff = LangModel(order - 1, alpha, sentences)
        self.lexicon = None
    else:
        self.backoff = None
        self.n = 0
    self.ngramFD = nltk.FreqDist()
    lexicon = set()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        wordNGrams = nltk.ngrams(words, order)
        for wordNGram in wordNGrams:
            self.ngramFD[wordNGram] += 1
            # self.ngramFD.inc(wordNGram)
            if order == 1:
                lexicon.add(wordNGram)
                self.n += 1
    self.v = len(lexicon)
Example #10
Source File: topics.py From ConvNetPy with MIT License | 6 votes |
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))

    return data
Example #11
Source File: nlp.py From BiBloSA with Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))
    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))
    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:  # note: the original source iterates context_token here as well (question_token may have been intended)
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))
    return {'context': context_tf, 'question': question_tf}
Example #12
Source File: analysis.py From BiBloSA with Apache License 2.0 | 6 votes |
def do_analysis(dataset_obj):
    # 1. all sample classification distribution
    # 2. all sentence sample classification distribution
    sample_num = dataset_obj.sample_num
    collect = []
    sent_collect = []
    for trees in dataset_obj.nn_data:
        for sample in trees:
            sentiment_float = sample['root_node']['sentiment_label']
            sentiment_int = cfg.sentiment_float_to_int(sentiment_float)
            if sample['is_sent']:
                sent_collect.append(sentiment_int)
            collect.append(sentiment_int)
    all_pdf = nltk.FreqDist(collect)
    sent_pdf = nltk.FreqDist(sent_collect)
    print('sample_num:', sample_num)
    print('all')
    print(all_pdf.tabulate())
    print('sent')
    print(sent_pdf.tabulate())
Example #13
Source File: dialogue.py From ConvNetPy with MIT License | 6 votes |
def load_data():
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    freqs = [ FreqDist(post.text) for post in posts ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))
    labels = list(set([ post.get('class') for post in posts ]))

    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, labels.index(post.get('class'))))

    return data
Example #14
Source File: nlp.py From BiBloSA with Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))
    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))
    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:  # note: the original source iterates context_token here as well (question_token may have been intended)
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))
    return {'context': context_tf, 'question': question_tf}
Example #15
Source File: tldr.py From SML-Cogs with MIT License | 6 votes |
def _calculate_word_scores(self, phrase_list):
    word_freq = nltk.FreqDist()
    word_degree = nltk.FreqDist()
    for phrase in phrase_list:
        # degree = len(filter(lambda x: not isNumeric(x), phrase)) - 1
        # SML above cost error
        degree = len(list(filter(lambda x: not isNumeric(x), phrase))) - 1
        for word in phrase:
            # word_freq.inc(word)
            # SML error above:
            word_freq[word] += 1
            # word_degree.inc(word, degree)  # other words
            word_degree[word] = degree
    for word in word_freq.keys():
        word_degree[word] = word_degree[word] + word_freq[word]  # itself
    # word score = deg(w) / freq(w)
    word_scores = {}
    for word in word_freq.keys():
        word_scores[word] = word_degree[word] / word_freq[word]
    return word_scores
Example #16
Source File: Trainer.py From truecase with Apache License 2.0 | 5 votes |
def __init__(self):
    self.uni_dist = nltk.FreqDist()
    self.backward_bi_dist = nltk.FreqDist()
    self.forward_bi_dist = nltk.FreqDist()
    self.trigram_dist = nltk.FreqDist()
    self.word_casing_lookup = {}
Example #17
Source File: nlp.py From BiBloSA with Apache License 2.0 | 5 votes |
def dynamic_length(lengthList, ratio, add=None, security=True, fileName=None):
    ratio = float(ratio)
    if add is not None:
        ratio += add
    ratio = ratio if ratio < 1 else 1
    if security:
        ratio = ratio if ratio < 0.99 else 0.99

    def calculate_dynamic_len(pdf, ratio_=ratio):
        cdf = []
        previous = 0
        # accumulate
        for len, freq in pdf:
            previous += freq
            cdf.append((len, previous))
        # calculate
        for len, accu in cdf:
            if 1.0 * accu / previous >= ratio_:  # satisfy the condition
                return len, cdf[-1][0]  # max
        return cdf[-1][0], cdf[-1][0]

    pdf = dict(nltk.FreqDist(lengthList))
    pdf = sorted(pdf.items(), key=lambda d: d[0])
    if fileName is not None:
        with open(fileName, 'w') as f:
            for len, freq in pdf:
                f.write('%d\t%d' % (len, freq))
                f.write(os.linesep)
    return calculate_dynamic_len(pdf, ratio)
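A quick sanity check with a made-up length list (assuming nltk and os are imported at module scope, as the function expects). The call returns the smallest length that covers at least 80% of the samples, plus the maximum length observed:

print(dynamic_length([1, 2, 2, 3, 3, 3, 10], 0.8))   # (3, 10)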
Example #18
Source File: data.py From artificial_neural_networks with Apache License 2.0 | 5 votes |
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([ (w, i) for i, w in enumerate(index2word) ])
    return index2word, word2index, freq_dist
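A quick usage sketch (the sentences and the UNK placeholder are made up for illustration; the original module defines its own UNK constant and imports):

import itertools
import nltk

UNK = '<unk>'  # hypothetical placeholder
sentences = [['hello', 'world'], ['hello', 'there']]
index2word, word2index, freq_dist = index_(sentences, vocab_size=10)
print(index2word)            # e.g. ['_', '<unk>', 'hello', 'world', 'there'] (tie order may vary)
print(word2index['hello'])   # 2
print(freq_dist['hello'])    # 2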
Example #19
Source File: data.py From artificial_neural_networks with Apache License 2.0 | 5 votes |
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([ (w, i) for i, w in enumerate(index2word) ])
    return index2word, word2index, freq_dist
Example #20
Source File: print_english_words.py From adversarial-squad with MIT License | 5 votes |
def main():
    freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
    vocab = [x[0] for x in freq_dist.most_common()[:OPTS.size]]
    for w in vocab:
        print w
Example #21
Source File: nlp.py From BiBloSA with Apache License 2.0 | 5 votes |
def dynamic_length(lengthList, ratio, add=None, security=True, fileName=None):
    ratio = float(ratio)
    if add is not None:
        ratio += add
    ratio = ratio if ratio < 1 else 1
    if security:
        ratio = ratio if ratio < 0.99 else 0.99

    def calculate_dynamic_len(pdf, ratio_=ratio):
        cdf = []
        previous = 0
        # accumulate
        for len, freq in pdf:
            previous += freq
            cdf.append((len, previous))
        # calculate
        for len, accu in cdf:
            if 1.0 * accu / previous >= ratio_:  # satisfy the condition
                return len, cdf[-1][0]  # max
        return cdf[-1][0], cdf[-1][0]

    pdf = dict(nltk.FreqDist(lengthList))
    pdf = sorted(pdf.items(), key=lambda d: d[0])
    if fileName is not None:
        with open(fileName, 'w') as f:
            for len, freq in pdf:
                f.write('%d\t%d' % (len, freq))
                f.write(os.linesep)
    return calculate_dynamic_len(pdf, ratio)
Example #22
Source File: adaptive_collect.py From CrisisLex with MIT License | 5 votes |
def set_adaptive(self, lex, learning_time=3, use_hashtags=True, new_terms_no=10):
    self.adaptive = True
    self.lex_set = set(lex)
    self.terms_no = new_terms_no
    self.use_hashtags = use_hashtags
    self.start_time = datetime.datetime.now()
    self.end_time = self.start_time + datetime.timedelta(hours=learning_time)
    print "Learning interval between %s to %s" % (self.start_time, self.end_time)
    self.terms_fd = nltk.FreqDist()
    self.terms = []
Example #23
Source File: nlp.py From BiBloSA with Apache License 2.0 | 5 votes |
def dynamic_length(lengthList, ratio, add=None, security=True, fileName=None):
    ratio = float(ratio)
    if add is not None:
        ratio += add
    ratio = ratio if ratio < 1 else 1
    if security:
        ratio = ratio if ratio < 0.99 else 0.99

    def calculate_dynamic_len(pdf, ratio_=ratio):
        cdf = []
        previous = 0
        # accumulate
        for len, freq in pdf:
            previous += freq
            cdf.append((len, previous))
        # calculate
        for len, accu in cdf:
            if 1.0 * accu / previous >= ratio_:  # satisfy the condition
                return len, cdf[-1][0]  # max
        return cdf[-1][0], cdf[-1][0]

    pdf = dict(nltk.FreqDist(lengthList))
    pdf = sorted(pdf.items(), key=lambda d: d[0])
    if fileName is not None:
        with open(fileName, 'w') as f:
            for len, freq in pdf:
                f.write('%d\t%d' % (len, freq))
                f.write(os.linesep)
    return calculate_dynamic_len(pdf, ratio)
Example #24
Source File: ner.py From metadoc with MIT License | 5 votes |
def _calculate_word_scores(self, word_list):
    """Quick and dirty, inspired by Sujit Pal's RAKE implementation.
    """
    word_freq = nltk.FreqDist()
    for word in word_list:
        word_freq[word] += 1
    word_scores = {k: v for k, v in word_freq.items() if v > 0}
    return word_scores

# def _get_mt_median(self, word_scores):
#     median = numpy.median([v for k, v in word_scores.items()])
#     return {k: v for k, v in word_scores.items() if v > median}
Example #25
Source File: vocab.py From quick-nlp with MIT License | 5 votes |
def __init__(self, tokens: List[Tokens], special_symbols: List[str] = None):
    special_symbols = [] if special_symbols is None else special_symbols
    special_symbols = special_symbols + ["<eot>", "<response>", "<eos>", "<unk>", "<pad>", "<bos>"]
    self.vocab = FreqDist()
    self.cdf = 0.
    for sample in tokens:
        for token in sample:
            if token not in special_symbols:
                self.vocab[token] += 1
    print(f"total samples in vocab: {self.vocab.N()}, total tokens in vocab: {self.vocab.B()}")
    self.itos = []
    self.stoi = {}
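For reference, FreqDist.N() is the total number of samples counted and FreqDist.B() is the number of distinct bins (token types), so the print above reports the total token count and the vocabulary size. A minimal illustration (assuming FreqDist is imported as in the example):

from nltk import FreqDist

fd = FreqDist(['a', 'b', 'a'])
print(fd.N(), fd.B())   # 3 2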
Example #26
Source File: nlp.py From DiSAN with Apache License 2.0 | 5 votes |
def dynamic_keep(collect, ratio, fileName=None):
    pdf = dict(nltk.FreqDist(collect))
    pdf = sorted(pdf.items(), key=lambda d: d[1], reverse=True)

    cdf = []
    previous = 0
    # accumulate
    for token, freq in pdf:
        previous += freq
        cdf.append((token, previous))
    # calculate
    for idx, (token, accu) in enumerate(cdf):
        keepAnchor = idx
        if 1.0 * accu / previous >= ratio:  # satisfy the condition
            break

    tokenList = []
    for idx, (token, freq) in enumerate(pdf):
        if idx > keepAnchor:
            break
        tokenList.append(token)

    if fileName is not None:
        with open(fileName, 'w') as f:
            for idx, (token, freq) in enumerate(pdf):
                f.write('%d\t%d' % (token, freq))
                f.write(os.linesep)
                if idx == keepAnchor:
                    print(os.linesep * 20)

    return tokenList
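A quick sanity check with a made-up token list (assuming nltk and os are imported at module scope, as the function expects). The call keeps the most frequent tokens until they cover at least 80% of all occurrences:

print(dynamic_keep(['a', 'a', 'a', 'b', 'b', 'c'], 0.8))   # ['a', 'b']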
Example #27
Source File: nlp.py From DiSAN with Apache License 2.0 | 5 votes |
def dynamic_length(lengthList, ratio, add=None, security=True, fileName=None):
    ratio = float(ratio)
    if add is not None:
        ratio += add
    ratio = ratio if ratio < 1 else 1
    if security:
        ratio = ratio if ratio < 0.99 else 0.99

    def calculate_dynamic_len(pdf, ratio_=ratio):
        cdf = []
        previous = 0
        # accumulate
        for len, freq in pdf:
            previous += freq
            cdf.append((len, previous))
        # calculate
        for len, accu in cdf:
            if 1.0 * accu / previous >= ratio_:  # satisfy the condition
                return len, cdf[-1][0]  # max
        return cdf[-1][0], cdf[-1][0]

    pdf = dict(nltk.FreqDist(lengthList))
    pdf = sorted(pdf.items(), key=lambda d: d[0])
    if fileName is not None:
        with open(fileName, 'w') as f:
            for len, freq in pdf:
                f.write('%d\t%d' % (len, freq))
                f.write(os.linesep)
    return calculate_dynamic_len(pdf, ratio)
Example #28
Source File: nlp.py From DiSAN with Apache License 2.0 | 5 votes |
def dynamic_keep(collect, ratio, fileName=None):
    pdf = dict(nltk.FreqDist(collect))
    pdf = sorted(pdf.items(), key=lambda d: d[1], reverse=True)

    cdf = []
    previous = 0
    # accumulate
    for token, freq in pdf:
        previous += freq
        cdf.append((token, previous))
    # calculate
    for idx, (token, accu) in enumerate(cdf):
        keepAnchor = idx
        if 1.0 * accu / previous >= ratio:  # satisfy the condition
            break

    tokenList = []
    for idx, (token, freq) in enumerate(pdf):
        if idx > keepAnchor:
            break
        tokenList.append(token)

    if fileName is not None:
        with open(fileName, 'w') as f:
            for idx, (token, freq) in enumerate(pdf):
                f.write('%d\t%d' % (token, freq))
                f.write(os.linesep)
                if idx == keepAnchor:
                    print(os.linesep * 20)

    return tokenList
Example #29
Source File: nlp.py From DiSAN with Apache License 2.0 | 5 votes |
def dynamic_length(lengthList, ratio, add=None, security=True, fileName=None):
    ratio = float(ratio)
    if add is not None:
        ratio += add
    ratio = ratio if ratio < 1 else 1
    if security:
        ratio = ratio if ratio < 0.99 else 0.99

    def calculate_dynamic_len(pdf, ratio_=ratio):
        cdf = []
        previous = 0
        # accumulate
        for len, freq in pdf:
            previous += freq
            cdf.append((len, previous))
        # calculate
        for len, accu in cdf:
            if 1.0 * accu / previous >= ratio_:  # satisfy the condition
                return len, cdf[-1][0]  # max
        return cdf[-1][0], cdf[-1][0]

    pdf = dict(nltk.FreqDist(lengthList))
    pdf = sorted(pdf.items(), key=lambda d: d[0])
    if fileName is not None:
        with open(fileName, 'w') as f:
            for len, freq in pdf:
                f.write('%d\t%d' % (len, freq))
                f.write(os.linesep)
    return calculate_dynamic_len(pdf, ratio)
Example #30
Source File: topics.py From ConvNetPy with MIT License | 5 votes |
def test():
    global N, words, network

    print 'In testing.'

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation,
    conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged
    in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long
    endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field,
    as a final resting place for those who here gave their lives that that nation might live. It is altogether
    fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not
    consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have
    consecrated it, far above our poor power to add or detract. The world will little note, nor long remember
    what we say here, but it can never forget what they did here. It is for us the living, rather, to be
    dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is
    rather for us to be here dedicated to the great task remaining before us -- that from these honored dead
    we take increased devotion to that cause for which they gave the last full measure of devotion -- that we
    here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have
    a new birth of freedom -- and that government of the people, by the people, for the people, shall not
    perish from the earth."""
    tokenizer = RegexpTokenizer('\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg)

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = network.forward(V).w

    topics = []
    while len(topics) != 5:
        max_act = max(pred)
        topic_idx = pred.index(max_act)
        topic = words[topic_idx]
        if topic in gettysburg_tokens:
            topics.append(topic)
        del pred[topic_idx]

    print 'Topics of the Gettysburg Address:'
    print topics