Python nltk.probability.FreqDist() Examples
The following are 30 code examples of nltk.probability.FreqDist().
You may also want to check out all available functions and classes of the module nltk.probability.
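Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the basic FreqDist API that most of them rely on:

from nltk.probability import FreqDist

tokens = "the cat sat on the mat with the other cat".split()
fd = FreqDist(tokens)                 # counts every token
print(fd.most_common(3))              # e.g. [('the', 3), ('cat', 2), ('sat', 1)]
print(fd['cat'], fd.N(), fd.B())      # count of 'cat', total tokens, distinct tokens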
Example #1
Source File: decisiontree.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

    # Find the best label for each value.
    pos_fdist = FreqDist()
    neg_fdist = FreqDist()
    for featureset, label in labeled_featuresets:
        if featureset.get(feature_name) == feature_value:
            pos_fdist[label] += 1
        else:
            neg_fdist[label] += 1

    decisions = {}
    default = label
    # But hopefully we have observations!
    if pos_fdist.N() > 0:
        decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    if neg_fdist.N() > 0:
        default = DecisionTreeClassifier(neg_fdist.max())

    return DecisionTreeClassifier(label, feature_name, decisions, default)
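The majority-label idiom at the top of binary_stump can be seen in isolation; the labeled_featuresets below are made-up data, not from the project:

from nltk.probability import FreqDist

labeled_featuresets = [
    ({'outlook': 'sunny'}, 'yes'),
    ({'outlook': 'rainy'}, 'no'),
    ({'outlook': 'sunny'}, 'yes'),
]
# The stump's default label is simply the most frequent label overall.
print(FreqDist(label for (featureset, label) in labeled_featuresets).max())  # 'yes'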
Example #2
Source File: decisiontree.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

    # Find the best label for each value.
    pos_fdist = FreqDist()
    neg_fdist = FreqDist()
    for featureset, label in labeled_featuresets:
        if featureset.get(feature_name) == feature_value:
            pos_fdist[label] += 1
        else:
            neg_fdist[label] += 1

    decisions = {}
    default = label
    # But hopefully we have observations!
    if pos_fdist.N() > 0:
        decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    if neg_fdist.N() > 0:
        default = DecisionTreeClassifier(neg_fdist.max())

    return DecisionTreeClassifier(label, feature_name, decisions, default)
Example #3
Source File: punkt.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def _freq_threshold(self, fdist, threshold):
    """
    Returns a FreqDist containing only data with counts below a given
    threshold, as well as a mapping (None -> count_removed).
    """
    # We assume that there is more data below the threshold than above it
    # and so create a new FreqDist rather than working in place.
    res = FreqDist()
    num_removed = 0
    for tok in fdist:
        count = fdist[tok]
        if count < threshold:
            num_removed += 1
        else:
            res[tok] += count
    res[None] += num_removed
    return res

#////////////////////////////////////////////////////////////
#{ Orthographic data
#////////////////////////////////////////////////////////////
Example #4
Source File: collocations.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence.  When window_size > 2, count non-contiguous bigrams, in the
    style of Church and Hanks's (1990) association ratio.
    """
    wfd = FreqDist()
    bfd = FreqDist()

    if window_size < 2:
        raise ValueError("Specify window_size at least 2")

    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        wfd[w1] += 1
        for w2 in window[1:]:
            if w2 is not None:
                bfd[(w1, w2)] += 1
    return cls(wfd, bfd, window_size=window_size)
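A small usage sketch against the public NLTK API (toy sentence, not from the project), showing how window_size > 2 also counts non-contiguous pairs:

from nltk.collocations import BigramCollocationFinder

tokens = "strong tea and strong coffee and biscuits".split()
finder = BigramCollocationFinder.from_words(tokens, window_size=3)
# word_fd holds the left-word counts, ngram_fd the (possibly skipped) pairs.
print(finder.ngram_fd[('strong', 'tea')])   # 1
print(finder.ngram_fd[('strong', 'and')])   # 2: 'and' falls inside a 'strong' window twice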
Example #5
Source File: eval_utils.py From tf-var-attention with MIT License | 6 votes |
def calculate_ngram_diversity(corpus):
    """
    Calculates unigram and bigram diversity
    Args:
        corpus: tokenized list of sentences sampled
    Returns:
        uni_diversity: distinct-1 score
        bi_diversity: distinct-2 score
    """
    bigram_finder = BigramCollocationFinder.from_words(corpus)
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity
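A hedged usage sketch, assuming the function above and its imports (FreqDist, BigramCollocationFinder) are in scope; note that `corpus` here is a flat list of tokens:

corpus = "i like tea i like coffee".split()
uni_diversity, bi_diversity = calculate_ngram_diversity(corpus)
print(uni_diversity, bi_diversity)   # distinct unigrams and bigrams divided by the token count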
Example #6
Source File: eval_utils.py From tf-var-attention with MIT License | 6 votes |
def calculate_entropy(corpus):
    """
    Calculates diversity in terms of entropy (using unigram probability)
    Args:
        corpus: tokenized list of sentences sampled
    Returns:
        ent: entropy on the sample sentence list
    """
    fdist = FreqDist(corpus)
    total_len = len(corpus)
    ent = 0
    for k, v in fdist.items():
        p = v / total_len
        ent += -p * np.log(p)

    return ent
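A quick check of the entropy helper on a toy token list (assumes the function above plus numpy and FreqDist are already imported):

corpus = ["a", "a", "b", "b"]        # two equally likely tokens
print(calculate_entropy(corpus))     # ln(2) ≈ 0.693, the maximum for two symbols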
Example #7
Source File: model.py From atap with Apache License 2.0 | 6 votes |
def __init__(self, n, vocabulary, unknown="<UNK>"):
    """
    n is the size of the ngram
    """
    if n < 1:
        raise ValueError("ngram size must be greater than or equal to 1")

    self.n = n
    self.unknown = unknown
    self.padding = {
        "pad_left": True,
        "pad_right": True,
        "left_pad_symbol": "<s>",
        "right_pad_symbol": "</s>",
    }

    self.vocabulary = vocabulary
    self.allgrams = defaultdict(ConditionalFreqDist)
    self.ngrams = FreqDist()
    self.unigrams = FreqDist()
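The constructor stores a ConditionalFreqDist per n-gram order; the counting pattern it is built around looks roughly like this (toy data and a hypothetical update loop, not the class's own logic):

from collections import defaultdict
from nltk.probability import ConditionalFreqDist, FreqDist

allgrams = defaultdict(ConditionalFreqDist)
unigrams = FreqDist()

sentence = ["<s>", "the", "cat", "sat", "</s>"]
for i in range(len(sentence) - 1):
    context, word = sentence[i], sentence[i + 1]
    allgrams[2][context][word] += 1   # bigram counts, keyed by order, then context
    unigrams[word] += 1

print(allgrams[2]["the"].max())       # 'cat', the most frequent continuation of 'the'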
Example #8
Source File: transformer.py From atap with Apache License 2.0 | 6 votes |
def transform(self, documents):
    words = []
    docs = []
    for document in documents:
        docs.append(document)
        for para in document:
            for sent in para:
                for token, tag in sent:
                    words.append(token)

    counts = FreqDist(words)
    self.reduced = set(
        w for w in words
        if counts[w] > self.min and counts[w] < self.max
    )

    return [
        ' '.join(self.normalize(doc)) for doc in docs
    ]
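The frequency-band filter in the middle of transform can be tried on its own; min_count and max_count below are made-up thresholds standing in for the transformer's self.min and self.max:

from nltk.probability import FreqDist

words = "a a a b b c c c c d".split()
min_count, max_count = 1, 4             # hypothetical thresholds
counts = FreqDist(words)
reduced = {w for w in words if min_count < counts[w] < max_count}
print(reduced)                          # {'a', 'b'}: neither too rare nor too common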
Example #9
Source File: agreement.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def Do_alpha(self):
    """The observed disagreement for the alpha coefficient.

    The alpha coefficient, unlike the other metrics, uses this rather than
    observed agreement.
    """
    total = 0.0
    for i, itemdata in self._grouped_data('item'):
        label_freqs = FreqDist(x['labels'] for x in itemdata)

        for j, nj in label_freqs.iteritems():
            for l, nl in label_freqs.iteritems():
                total += float(nj * nl) * self.distance(l, j)
    ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
    log.debug("Observed disagreement: %f", ret)
    return ret
Example #10
Source File: punkt.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def _freq_threshold(self, fdist, threshold):
    """
    Returns a FreqDist containing only data with counts below a given
    threshold, as well as a mapping (None -> count_removed).
    """
    # We assume that there is more data below the threshold than above it
    # and so create a new FreqDist rather than working in place.
    res = FreqDist()
    num_removed = 0
    for tok, count in fdist.iteritems():
        if count < threshold:
            num_removed += 1
        else:
            res.inc(tok, count)
    res.inc(None, num_removed)
    return res

#////////////////////////////////////////////////////////////
#{ Orthographic data
#////////////////////////////////////////////////////////////
Example #11
Source File: collocations.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence.  By default, bigrams must be contiguous.
    """
    wfd = FreqDist()
    bfd = FreqDist()
    if window_size < 2:
        raise ValueError, "Specify window_size at least 2"
    for window in ingrams(words, window_size, pad_right=True):
        w1 = window[0]
        try:
            window = window[:list(window).index(w1, 1)]
        except ValueError:
            pass
        wfd.inc(w1)
        for w2 in set(window[1:]):
            if w2 is not None:
                bfd.inc((w1, w2))
    return cls(wfd, bfd)
Example #12
Source File: decisiontree.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist([label for (featureset, label)
                      in labeled_featuresets]).max()

    # Find the best label for each value.
    pos_fdist = FreqDist()
    neg_fdist = FreqDist()
    for featureset, label in labeled_featuresets:
        if featureset.get(feature_name) == feature_value:
            pos_fdist.inc(label)
        else:
            neg_fdist.inc(label)

    decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    default = DecisionTreeClassifier(neg_fdist.max())

    return DecisionTreeClassifier(label, feature_name, decisions, default)
Example #13
Source File: agreement.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def Do_alpha(self):
    """The observed disagreement for the alpha coefficient.

    The alpha coefficient, unlike the other metrics, uses this rather than
    observed agreement.
    """
    total = 0.0
    for i, itemdata in self._grouped_data('item'):
        label_freqs = FreqDist(x['labels'] for x in itemdata)

        for j, nj in iteritems(label_freqs):
            for l, nl in iteritems(label_freqs):
                total += float(nj * nl) * self.distance(l, j)
    ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
    log.debug("Observed disagreement: %f", ret)
    return ret
Example #14
Source File: sentiment_analyzer.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def unigram_word_feats(self, words, top_n=None, min_freq=0):
    """
    Return most common top_n word features.

    :param words: a list of words/tokens.
    :param top_n: number of best words/tokens to use, sorted by frequency.
    :rtype: list(str)
    :return: A list of `top_n` words/tokens (with no duplicates) sorted by
        frequency.
    """
    # Stopwords are not removed
    unigram_feats_freqs = FreqDist(word for word in words)
    return [
        w
        for w, f in unigram_feats_freqs.most_common(top_n)
        if unigram_feats_freqs[w] > min_freq
    ]
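Roughly the same most_common/min_freq pattern outside the class (hypothetical word list; the real method is bound to the sentiment analyzer instance):

from nltk.probability import FreqDist

words = "good good good bad bad fine odd".split()
freqs = FreqDist(words)
top_n, min_freq = 3, 1
feats = [w for w, f in freqs.most_common(top_n) if freqs[w] > min_freq]
print(feats)   # ['good', 'bad']: 'fine' makes the top 3 but not the min_freq cut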
Example #15
Source File: collocations.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def from_words(cls, words, window_size=3):
    """Construct a TrigramCollocationFinder for all trigrams in the given
    sequence.
    """
    if window_size < 3:
        raise ValueError("Specify window_size at least 3")

    wfd = FreqDist()
    wildfd = FreqDist()
    bfd = FreqDist()
    tfd = FreqDist()
    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        for w2, w3 in _itertools.combinations(window[1:], 2):
            wfd[w1] += 1
            if w2 is None:
                continue
            bfd[(w1, w2)] += 1
            if w3 is None:
                continue
            wildfd[(w1, w3)] += 1
            tfd[(w1, w2, w3)] += 1
    return cls(wfd, bfd, wildfd, tfd)
Example #16
Source File: collocations.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence.  When window_size > 2, count non-contiguous bigrams, in the
    style of Church and Hanks's (1990) association ratio.
    """
    wfd = FreqDist()
    bfd = FreqDist()

    if window_size < 2:
        raise ValueError("Specify window_size at least 2")

    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        wfd[w1] += 1
        for w2 in window[1:]:
            if w2 is not None:
                bfd[(w1, w2)] += 1
    return cls(wfd, bfd, window_size=window_size)
Example #17
Source File: punkt.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def _freq_threshold(self, fdist, threshold):
    """
    Returns a FreqDist containing only data with counts below a given
    threshold, as well as a mapping (None -> count_removed).
    """
    # We assume that there is more data below the threshold than above it
    # and so create a new FreqDist rather than working in place.
    res = FreqDist()
    num_removed = 0
    for tok in fdist:
        count = fdist[tok]
        if count < threshold:
            num_removed += 1
        else:
            res[tok] += count
    res[None] += num_removed
    return res

# ////////////////////////////////////////////////////////////
# { Orthographic data
# ////////////////////////////////////////////////////////////
Example #18
Source File: collocations_app.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def run(self):
    try:
        words = self.model.CORPORA[self.name]()
        from operator import itemgetter

        text = [w for w in words if len(w) > 2]
        fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
        vocab = FreqDist(text)
        scored = [
            ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
            for w1, w2 in fd
        ]
        scored.sort(key=itemgetter(1), reverse=True)
        self.model.collocations = list(map(itemgetter(0), scored))
        self.model.queue.put(CORPUS_LOADED_EVENT)
    except Exception as e:
        print(e)
        self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)

# def collocations():
#     colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
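The scoring heuristic used here (bigram count cubed over the product of the unigram counts) can be reproduced on a plain token list; the corpus loading and event-queue plumbing are specific to the app and omitted:

from operator import itemgetter
from nltk.probability import FreqDist

words = "new york is larger than new jersey but new york is pricier".split()
text = [w for w in words if len(w) > 2]
fd = FreqDist(tuple(text[i:i + 2]) for i in range(len(text) - 1))
vocab = FreqDist(text)
scored = [((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2])) for w1, w2 in fd]
scored.sort(key=itemgetter(1), reverse=True)
print(scored[0][0])   # ('new', 'york') scores highest on this toy text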
Example #19
Source File: read_data.py From CommonSenseMultiHopQA with MIT License | 6 votes |
def get_stop_words_1(data, num_stop_words):
    total_words = []
    for d in data:
        total_words.extend(d["ques"])
        total_words.extend(d["answer1"])
        for d_i in d["summary"]:
            total_words.extend(d_i)
    fdist = FreqDist(total_words)
    stop_words = fdist.most_common(num_stop_words)
    stop_words = [t[0] for t in stop_words]

    pronoun_list = ["he", "she", "him", "her", "his", "them", "their", "they"]
    filtered_stop_words = []
    for p in stop_words:
        if p not in pronoun_list:
            filtered_stop_words.append(p)
    return filtered_stop_words
Example #20
Source File: general.py From CommonSenseMultiHopQA with MIT License | 6 votes |
def sample_relations_top_n(graph, context, type_):
    num_total_words = len(context)
    dist = FreqDist(context)

    for node in graph:
        node = build_score_per_layer(node, dist, num_total_words)

    for node in graph:
        node = calc_top_n_score_by_level(node)

    for i, node in enumerate(graph):
        graph[i] = prune_graph_by_top_n_softmax(node)

    selected_paths = select_paths(graph)
    paths = build_subpaths(selected_paths)
    final_paths = list(paths for paths, _ in itertools.groupby(paths))
    random.shuffle(final_paths)
    return final_paths
Example #21
Source File: metric.py From MultiTurnDialogZoo with MIT License | 6 votes |
def cal_Distinct(corpus):
    """
    Calculates unigram and bigram diversity
    Args:
        corpus: tokenized list of sentences sampled
    Returns:
        uni_diversity: distinct-1 score
        bi_diversity: distinct-2 score
    """
    bigram_finder = BigramCollocationFinder.from_words(corpus)
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity
Example #22
Source File: collocations.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def from_words(cls, words):
    """Construct a TrigramCollocationFinder for all trigrams in the given
    sequence.
    """
    wfd = FreqDist()
    wildfd = FreqDist()
    bfd = FreqDist()
    tfd = FreqDist()

    for w1, w2, w3 in ingrams(words, 3, pad_right=True):
        wfd.inc(w1)
        if w2 is None:
            continue
        bfd.inc((w1, w2))
        if w3 is None:
            continue
        wildfd.inc((w1, w3))
        tfd.inc((w1, w2, w3))
    return cls(wfd, bfd, wildfd, tfd)
Example #23
Source File: agreement.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def pi(self):
    """Scott 1955; here, multi-pi.
    Equivalent to K from Siegel and Castellan (1988).
    """
    total = 0.0
    label_freqs = FreqDist(x['labels'] for x in self.data)
    for k, f in iteritems(label_freqs):
        total += f ** 2
    Ae = total / ((len(self.I) * len(self.C)) ** 2)
    return (self.avg_Ao() - Ae) / (1 - Ae)
Example #24
Source File: agreement.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def alpha(self):
    """Krippendorff 1980
    """
    # check for degenerate cases
    if len(self.K) == 0:
        raise ValueError("Cannot calculate alpha, no data present!")
    if len(self.K) == 1:
        log.debug("Only one annotation value, alpha returning 1.")
        return 1
    if len(self.C) == 1 and len(self.I) == 1:
        raise ValueError("Cannot calculate alpha, only one coder and item present!")

    total_disagreement = 0.0
    total_ratings = 0
    all_valid_labels_freq = FreqDist([])

    total_do = 0.0  # Total observed disagreement for all items.
    for i, itemdata in self._grouped_data('item'):
        label_freqs = FreqDist(x['labels'] for x in itemdata)
        labels_count = sum(label_freqs.values())
        if labels_count < 2:
            # Ignore the item.
            continue
        all_valid_labels_freq += label_freqs
        total_do += self.Disagreement(label_freqs) * labels_count

    do = total_do / sum(all_valid_labels_freq.values())

    de = self.Disagreement(all_valid_labels_freq)  # Expected disagreement.
    k_alpha = 1.0 - do / de

    return k_alpha
Example #25
Source File: read_data.py From CommonSenseMultiHopQA with MIT License | 5 votes |
def get_stop_words(total_words, num_stop_words):
    fdist = FreqDist(total_words)
    stop_words = fdist.most_common(num_stop_words)
    stop_words = [t[0] for t in stop_words]

    pronoun_list = ["he", "she", "him", "her", "his", "them", "their", "they"]
    filtered_stop_words = []
    for p in stop_words:
        if p not in pronoun_list:
            filtered_stop_words.append(p)
    return filtered_stop_words
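A quick look at what the helper returns for a tiny token list (hypothetical input; real callers pass the concatenated question, answer, and summary tokens):

total_words = "they said the answer is the answer they wanted".split()
print(get_stop_words(total_words, 3))   # ['the', 'answer']: 'they' is dropped by the pronoun filter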
Example #26
Source File: general.py From CommonSenseMultiHopQA with MIT License | 5 votes |
def build_trees_one_hop(definitions, query, freq_words, context):
    context_string = ' '.join(context)
    num_total_words = len(context)

    query = [q.lower() for q in query
             if (q not in freq_words and q in definitions)]
    dist = FreqDist(context)

    graph = []
    for q in query:
        for (rel, w_2) in definitions[q]:
            if check_context(w_2, q, context_string, context, freq_words):
                new_vertex_1, graph, parent_vertex = is_new_vertex(
                    graph, q, w_2, rel, 1, None)
                if new_vertex_1:
                    vertex_1 = create_vertex(q, w_2, None, 1, rel, definitions)
                    graph.append(vertex_1)
    return graph
Example #27
Source File: recall_model.py From Customer-Chatbot with MIT License | 5 votes |
def plot_words(wordList):
    fDist = FreqDist(wordList)
    # print(fDist.most_common())
    print("单词总数: ", fDist.N())    # total number of tokens
    print("不同单词数: ", fDist.B())  # number of distinct word types
    fDist.plot(10)
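Here N() is the total sample count and B() the number of bins (distinct tokens), and FreqDist.plot needs matplotlib installed. A minimal call, assuming plot_words from above is in scope:

plot_words("how are you how are things today".split())   # prints 7 and 5, then plots the top 10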
Example #28
Source File: collocations.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def _apply_filter(self, fn=lambda ngram, freq: False):
    """Generic filter removes ngrams from the frequency distribution
    if the function returns True when passed an ngram tuple.
    """
    tmp_ngram = FreqDist()
    for ngram, freq in iteritems(self.ngram_fd):
        if not fn(ngram, freq):
            tmp_ngram[ngram] = freq
    self.ngram_fd = tmp_ngram
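The public counterpart of this private helper is apply_freq_filter on a collocation finder; a small sketch with toy tokens:

from nltk.collocations import BigramCollocationFinder

finder = BigramCollocationFinder.from_words("a b a b a c".split())
finder.apply_freq_filter(2)             # drop bigrams seen fewer than 2 times
print(sorted(finder.ngram_fd.items()))  # [(('a', 'b'), 2), (('b', 'a'), 2)]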
Example #29
Source File: recall_model.py From Customer-Chatbot with MIT License | 5 votes |
def plot_words(wordList):
    fDist = FreqDist(wordList)
    # print(fDist.most_common())
    print("单词总数: ", fDist.N())    # total number of tokens
    print("不同单词数: ", fDist.B())  # number of distinct word types
    fDist.plot(10)
Example #30
Source File: collocations.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def _ngram_freqdist(words, n):
    return FreqDist(tuple(words[i:i + n]) for i in range(len(words) - 1))
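A hedged sketch of what the bigram case produces (assuming the helper above is in scope; note that the range stops at len(words) - 1 regardless of n, so for n > 2 the trailing tuples come out short):

words = "to be or not to be".split()
print(_ngram_freqdist(words, 2).most_common(2))   # [(('to', 'be'), 2), (('be', 'or'), 1)]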