Python nltk.probability.FreqDist() Examples
The following are 30 code examples of nltk.probability.FreqDist().
You may also want to check out all available functions and classes of the module nltk.probability.
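Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the basic FreqDist API that most of them rely on:

from nltk.probability import FreqDist

tokens = "the cat sat on the mat with the other cat".split()
fd = FreqDist(tokens)                 # counts every token
print(fd.most_common(3))              # e.g. [('the', 3), ('cat', 2), ('sat', 1)]
print(fd['cat'], fd.N(), fd.B())      # count of 'cat', total tokens, distinct tokens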
Example #1
Source File: decisiontree.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

    # Find the best label for each value.
    pos_fdist = FreqDist()
    neg_fdist = FreqDist()
    for featureset, label in labeled_featuresets:
        if featureset.get(feature_name) == feature_value:
            pos_fdist[label] += 1
        else:
            neg_fdist[label] += 1

    decisions = {}
    default = label
    # But hopefully we have observations!
    if pos_fdist.N() > 0:
        decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    if neg_fdist.N() > 0:
        default = DecisionTreeClassifier(neg_fdist.max())

    return DecisionTreeClassifier(label, feature_name, decisions, default)
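The majority-label idiom at the top of binary_stump can be seen in isolation; the labeled_featuresets below are made-up data, not from the project:

from nltk.probability import FreqDist

labeled_featuresets = [
    ({'outlook': 'sunny'}, 'yes'),
    ({'outlook': 'rainy'}, 'no'),
    ({'outlook': 'sunny'}, 'yes'),
]
# The stump's default label is simply the most frequent label overall.
print(FreqDist(label for (featureset, label) in labeled_featuresets).max())  # 'yes'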
Example #2
Source File: decisiontree.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

    # Find the best label for each value.
    pos_fdist = FreqDist()
    neg_fdist = FreqDist()
    for featureset, label in labeled_featuresets:
        if featureset.get(feature_name) == feature_value:
            pos_fdist[label] += 1
        else:
            neg_fdist[label] += 1

    decisions = {}
    default = label
    # But hopefully we have observations!
    if pos_fdist.N() > 0:
        decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    if neg_fdist.N() > 0:
        default = DecisionTreeClassifier(neg_fdist.max())

    return DecisionTreeClassifier(label, feature_name, decisions, default)
Example #3
Source File: punkt.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def _freq_threshold(self, fdist, threshold):
    """
    Returns a FreqDist containing only data with counts below a given
    threshold, as well as a mapping (None -> count_removed).
    """
    # We assume that there is more data below the threshold than above it
    # and so create a new FreqDist rather than working in place.
    res = FreqDist()
    num_removed = 0
    for tok in fdist:
        count = fdist[tok]
        if count < threshold:
            num_removed += 1
        else:
            res[tok] += count
    res[None] += num_removed
    return res

#////////////////////////////////////////////////////////////
#{ Orthographic data
#////////////////////////////////////////////////////////////
Example #4
Source File: collocations.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence.  When window_size > 2, count non-contiguous bigrams, in the
    style of Church and Hanks's (1990) association ratio.
    """
    wfd = FreqDist()
    bfd = FreqDist()

    if window_size < 2:
        raise ValueError("Specify window_size at least 2")

    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        wfd[w1] += 1
        for w2 in window[1:]:
            if w2 is not None:
                bfd[(w1, w2)] += 1
    return cls(wfd, bfd, window_size=window_size)
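A small usage sketch against the public NLTK API (toy sentence, not from the project), showing how window_size > 2 also counts non-contiguous pairs:

from nltk.collocations import BigramCollocationFinder

tokens = "strong tea and strong coffee and biscuits".split()
finder = BigramCollocationFinder.from_words(tokens, window_size=3)
# word_fd holds the left-word counts, ngram_fd the (possibly skipped) pairs.
print(finder.ngram_fd[('strong', 'tea')])   # 1
print(finder.ngram_fd[('strong', 'and')])   # 2: 'and' falls inside a 'strong' window twice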
Example #5
Source File: eval_utils.py From tf-var-attention with MIT License | 6 votes |
def calculate_ngram_diversity(corpus):
    """
    Calculates unigram and bigram diversity
    Args:
        corpus: tokenized list of sentences sampled
    Returns:
        uni_diversity: distinct-1 score
        bi_diversity: distinct-2 score
    """
    bigram_finder = BigramCollocationFinder.from_words(corpus)
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity
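A hedged usage sketch, assuming the function above and its imports (FreqDist, BigramCollocationFinder) are in scope; note that `corpus` here is a flat list of tokens:

corpus = "i like tea i like coffee".split()
uni_diversity, bi_diversity = calculate_ngram_diversity(corpus)
print(uni_diversity, bi_diversity)   # distinct unigrams and bigrams divided by the token count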
Example #6
Source File: eval_utils.py From tf-var-attention with MIT License | 6 votes |
def calculate_entropy(corpus):
    """
    Calculates diversity in terms of entropy (using unigram probability)
    Args:
        corpus: tokenized list of sentences sampled
    Returns:
        ent: entropy on the sample sentence list
    """
    fdist = FreqDist(corpus)
    total_len = len(corpus)
    ent = 0
    for k, v in fdist.items():
        p = v / total_len
        ent += -p * np.log(p)

    return ent
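A quick check of the entropy helper on a toy token list (assumes the function above plus numpy and FreqDist are already imported):

corpus = ["a", "a", "b", "b"]        # two equally likely tokens
print(calculate_entropy(corpus))     # ln(2) ≈ 0.693, the maximum for two symbols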
Example #7
Source File: model.py From atap with Apache License 2.0 | 6 votes |
def __init__(self, n, vocabulary, unknown="<UNK>"):
    """
    n is the size of the ngram
    """
    if n < 1:
        raise ValueError("ngram size must be greater than or equal to 1")

    self.n = n
    self.unknown = unknown
    self.padding = {
        "pad_left": True,
        "pad_right": True,
        "left_pad_symbol": "<s>",
        "right_pad_symbol": "</s>",
    }

    self.vocabulary = vocabulary
    self.allgrams = defaultdict(ConditionalFreqDist)
    self.ngrams = FreqDist()
    self.unigrams = FreqDist()
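The constructor stores a ConditionalFreqDist per n-gram order; the counting pattern it is built around looks roughly like this (toy data and a hypothetical update loop, not the class's own logic):

from collections import defaultdict
from nltk.probability import ConditionalFreqDist, FreqDist

allgrams = defaultdict(ConditionalFreqDist)
unigrams = FreqDist()

sentence = ["<s>", "the", "cat", "sat", "</s>"]
for i in range(len(sentence) - 1):
    context, word = sentence[i], sentence[i + 1]
    allgrams[2][context][word] += 1   # bigram counts, keyed by order, then context
    unigrams[word] += 1

print(allgrams[2]["the"].max())       # 'cat', the most frequent continuation of 'the'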
Example #8
Source File: transformer.py From atap with Apache License 2.0 | 6 votes |
def transform(self, documents):
    words = []
    docs = []
    for document in documents:
        docs.append(document)
        for para in document:
            for sent in para:
                for token, tag in sent:
                    words.append(token)

    counts = FreqDist(words)
    self.reduced = set(
        w for w in words
        if counts[w] > self.min and counts[w] < self.max
    )

    return [
        ' '.join(self.normalize(doc)) for doc in docs
    ]
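The frequency-band filter in the middle of transform can be tried on its own; min_count and max_count below are made-up thresholds standing in for the transformer's self.min and self.max:

from nltk.probability import FreqDist

words = "a a a b b c c c c d".split()
min_count, max_count = 1, 4             # hypothetical thresholds
counts = FreqDist(words)
reduced = {w for w in words if min_count < counts[w] < max_count}
print(reduced)                          # {'a', 'b'}: neither too rare nor too common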
Example #9
Source File: agreement.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def Do_alpha(self):
    """The observed disagreement for the alpha coefficient.

    The alpha coefficient, unlike the other metrics, uses this rather than
    observed agreement.
    """
    total = 0.0
    for i, itemdata in self._grouped_data('item'):
        label_freqs = FreqDist(x['labels'] for x in itemdata)

        for j, nj in label_freqs.iteritems():
            for l, nl in label_freqs.iteritems():
                total += float(nj * nl) * self.distance(l, j)
    ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
    log.debug("Observed disagreement: %f", ret)
    return ret
Example #10
Source File: punkt.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def _freq_threshold(self, fdist, threshold):
    """
    Returns a FreqDist containing only data with counts below a given
    threshold, as well as a mapping (None -> count_removed).
    """
    # We assume that there is more data below the threshold than above it
    # and so create a new FreqDist rather than working in place.
    res = FreqDist()
    num_removed = 0
    for tok, count in fdist.iteritems():
        if count < threshold:
            num_removed += 1
        else:
            res.inc(tok, count)
    res.inc(None, num_removed)
    return res

#////////////////////////////////////////////////////////////
#{ Orthographic data
#////////////////////////////////////////////////////////////
Example #11
Source File: collocations.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence.  By default, bigrams must be contiguous.
    """
    wfd = FreqDist()
    bfd = FreqDist()
    if window_size < 2:
        raise ValueError, "Specify window_size at least 2"
    for window in ingrams(words, window_size, pad_right=True):
        w1 = window[0]
        try:
            window = window[:list(window).index(w1, 1)]
        except ValueError:
            pass
        wfd.inc(w1)
        for w2 in set(window[1:]):
            if w2 is not None:
                bfd.inc((w1, w2))
    return cls(wfd, bfd)
Example #12
Source File: decisiontree.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist([label for (featureset, label)
                      in labeled_featuresets]).max()

    # Find the best label for each value.
    pos_fdist = FreqDist()
    neg_fdist = FreqDist()
    for featureset, label in labeled_featuresets:
        if featureset.get(feature_name) == feature_value:
            pos_fdist.inc(label)
        else:
            neg_fdist.inc(label)

    decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    default = DecisionTreeClassifier(neg_fdist.max())

    return DecisionTreeClassifier(label, feature_name, decisions, default)
Example #13
Source File: agreement.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def Do_alpha(self):
    """The observed disagreement for the alpha coefficient.

    The alpha coefficient, unlike the other metrics, uses this rather than
    observed agreement.
    """
    total = 0.0
    for i, itemdata in self._grouped_data('item'):
        label_freqs = FreqDist(x['labels'] for x in itemdata)

        for j, nj in iteritems(label_freqs):
            for l, nl in iteritems(label_freqs):
                total += float(nj * nl) * self.distance(l, j)
    ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
    log.debug("Observed disagreement: %f", ret)
    return ret
Example #14
Source File: sentiment_analyzer.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def unigram_word_feats(self, words, top_n=None, min_freq=0):
    """
    Return most common top_n word features.

    :param words: a list of words/tokens.
    :param top_n: number of best words/tokens to use, sorted by frequency.
    :rtype: list(str)
    :return: A list of `top_n` words/tokens (with no duplicates) sorted by
        frequency.
    """
    # Stopwords are not removed
    unigram_feats_freqs = FreqDist(word for word in words)
    return [
        w
        for w, f in unigram_feats_freqs.most_common(top_n)
        if unigram_feats_freqs[w] > min_freq
    ]
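Roughly the same most_common/min_freq pattern outside the class (hypothetical word list; the real method is bound to the sentiment analyzer instance):

from nltk.probability import FreqDist

words = "good good good bad bad fine odd".split()
freqs = FreqDist(words)
top_n, min_freq = 3, 1
feats = [w for w, f in freqs.most_common(top_n) if freqs[w] > min_freq]
print(feats)   # ['good', 'bad']: 'fine' makes the top 3 but not the min_freq cut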
Example #15
Source File: collocations.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def from_words(cls, words, window_size=3):
    """Construct a TrigramCollocationFinder for all trigrams in the given
    sequence.
    """
    if window_size < 3:
        raise ValueError("Specify window_size at least 3")

    wfd = FreqDist()
    wildfd = FreqDist()
    bfd = FreqDist()
    tfd = FreqDist()
    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        for w2, w3 in _itertools.combinations(window[1:], 2):
            wfd[w1] += 1
            if w2 is None:
                continue
            bfd[(w1, w2)] += 1
            if w3 is None:
                continue
            wildfd[(w1, w3)] += 1
            tfd[(w1, w2, w3)] += 1
    return cls(wfd, bfd, wildfd, tfd)
Example #16
Source File: collocations.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence.  When window_size > 2, count non-contiguous bigrams, in the
    style of Church and Hanks's (1990) association ratio.
    """
    wfd = FreqDist()
    bfd = FreqDist()

    if window_size < 2:
        raise ValueError("Specify window_size at least 2")

    for window in ngrams(words, window_size, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        wfd[w1] += 1
        for w2 in window[1:]:
            if w2 is not None:
                bfd[(w1, w2)] += 1
    return cls(wfd, bfd, window_size=window_size)
Example #17
Source File: punkt.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def _freq_threshold(self, fdist, threshold):
    """
    Returns a FreqDist containing only data with counts below a given
    threshold, as well as a mapping (None -> count_removed).
    """
    # We assume that there is more data below the threshold than above it
    # and so create a new FreqDist rather than working in place.
    res = FreqDist()
    num_removed = 0
    for tok in fdist:
        count = fdist[tok]
        if count < threshold:
            num_removed += 1
        else:
            res[tok] += count
    res[None] += num_removed
    return res

# ////////////////////////////////////////////////////////////
# { Orthographic data
# ////////////////////////////////////////////////////////////
Example #18
Source File: collocations_app.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def run(self):
    try:
        words = self.model.CORPORA[self.name]()
        from operator import itemgetter

        text = [w for w in words if len(w) > 2]
        fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
        vocab = FreqDist(text)
        scored = [
            ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
            for w1, w2 in fd
        ]
        scored.sort(key=itemgetter(1), reverse=True)
        self.model.collocations = list(map(itemgetter(0), scored))
        self.model.queue.put(CORPUS_LOADED_EVENT)
    except Exception as e:
        print(e)
        self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)

# def collocations():
#     colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
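The scoring heuristic used here (bigram count cubed over the product of the unigram counts) can be reproduced on a plain token list; the corpus loading and event-queue plumbing are specific to the app and omitted:

from operator import itemgetter
from nltk.probability import FreqDist

words = "new york is larger than new jersey but new york is pricier".split()
text = [w for w in words if len(w) > 2]
fd = FreqDist(tuple(text[i:i + 2]) for i in range(len(text) - 1))
vocab = FreqDist(text)
scored = [((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2])) for w1, w2 in fd]
scored.sort(key=itemgetter(1), reverse=True)
print(scored[0][0])   # ('new', 'york') scores highest on this toy text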
Example #19
Source File: read_data.py From CommonSenseMultiHopQA with MIT License | 6 votes |
def get_stop_words_1(data, num_stop_words):
    total_words = []
    for d in data:
        total_words.extend(d["ques"])
        total_words.extend(d["answer1"])
        for d_i in d["summary"]:
            total_words.extend(d_i)
    fdist = FreqDist(total_words)
    stop_words = fdist.most_common(num_stop_words)
    stop_words = [t[0] for t in stop_words]

    pronoun_list = ["he", "she", "him", "her", "his", "them", "their", "they"]
    filtered_stop_words = []
    for p in stop_words:
        if p not in pronoun_list:
            filtered_stop_words.append(p)
    return filtered_stop_words
Example #20
Source File: general.py From CommonSenseMultiHopQA with MIT License | 6 votes |
def sample_relations_top_n(graph, context, type_):
    num_total_words = len(context)
    dist = FreqDist(context)

    for node in graph:
        node = build_score_per_layer(node, dist, num_total_words)

    for node in graph:
        node = calc_top_n_score_by_level(node)

    for i, node in enumerate(graph):
        graph[i] = prune_graph_by_top_n_softmax(node)

    selected_paths = select_paths(graph)
    paths = build_subpaths(selected_paths)
    final_paths = list(paths for paths, _ in itertools.groupby(paths))
    random.shuffle(final_paths)
    return final_paths
Example #21
Source File: metric.py From MultiTurnDialogZoo with MIT License | 6 votes |
def cal_Distinct(corpus):
    """
    Calculates unigram and bigram diversity
    Args:
        corpus: tokenized list of sentences sampled
    Returns:
        uni_diversity: distinct-1 score
        bi_diversity: distinct-2 score
    """
    bigram_finder = BigramCollocationFinder.from_words(corpus)
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity
Example #22
Source File: collocations.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def from_words(cls, words):
    """Construct a TrigramCollocationFinder for all trigrams in the given
    sequence.
    """
    wfd = FreqDist()
    wildfd = FreqDist()
    bfd = FreqDist()
    tfd = FreqDist()

    for w1, w2, w3 in ingrams(words, 3, pad_right=True):
        wfd.inc(w1)
        if w2 is None:
            continue
        bfd.inc((w1, w2))
        if w3 is None:
            continue
        wildfd.inc((w1, w3))
        tfd.inc((w1, w2, w3))
    return cls(wfd, bfd, wildfd, tfd)
Example #23
Source File: agreement.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def pi(self):
    """Scott 1955; here, multi-pi.
    Equivalent to K from Siegel and Castellan (1988).
    """
    total = 0.0
    label_freqs = FreqDist(x['labels'] for x in self.data)
    for k, f in iteritems(label_freqs):
        total += f ** 2
    Ae = total / ((len(self.I) * len(self.C)) ** 2)
    return (self.avg_Ao() - Ae) / (1 - Ae)
Example #24
Source File: agreement.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def alpha(self):
    """Krippendorff 1980
    """
    # check for degenerate cases
    if len(self.K) == 0:
        raise ValueError("Cannot calculate alpha, no data present!")
    if len(self.K) == 1:
        log.debug("Only one annotation value, alpha returning 1.")
        return 1
    if len(self.C) == 1 and len(self.I) == 1:
        raise ValueError("Cannot calculate alpha, only one coder and item present!")

    total_disagreement = 0.0
    total_ratings = 0
    all_valid_labels_freq = FreqDist([])

    total_do = 0.0  # Total observed disagreement for all items.
    for i, itemdata in self._grouped_data('item'):
        label_freqs = FreqDist(x['labels'] for x in itemdata)
        labels_count = sum(label_freqs.values())
        if labels_count < 2:
            # Ignore the item.
            continue
        all_valid_labels_freq += label_freqs
        total_do += self.Disagreement(label_freqs) * labels_count

    do = total_do / sum(all_valid_labels_freq.values())

    de = self.Disagreement(all_valid_labels_freq)  # Expected disagreement.
    k_alpha = 1.0 - do / de

    return k_alpha
Example #25
Source File: read_data.py From CommonSenseMultiHopQA with MIT License | 5 votes |
def get_stop_words(total_words, num_stop_words):
    fdist = FreqDist(total_words)
    stop_words = fdist.most_common(num_stop_words)
    stop_words = [t[0] for t in stop_words]

    pronoun_list = ["he", "she", "him", "her", "his", "them", "their", "they"]
    filtered_stop_words = []
    for p in stop_words:
        if p not in pronoun_list:
            filtered_stop_words.append(p)
    return filtered_stop_words
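A quick look at what the helper returns for a tiny token list (hypothetical input; real callers pass the concatenated question, answer, and summary tokens):

total_words = "they said the answer is the answer they wanted".split()
print(get_stop_words(total_words, 3))   # ['the', 'answer']: 'they' is dropped by the pronoun filter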
Example #26
Source File: general.py From CommonSenseMultiHopQA with MIT License | 5 votes |
def build_trees_one_hop(definitions, query, freq_words, context):
    context_string = ' '.join(context)
    num_total_words = len(context)

    query = [q.lower() for q in query
             if (q not in freq_words and q in definitions)]
    dist = FreqDist(context)

    graph = []
    for q in query:
        for (rel, w_2) in definitions[q]:
            if check_context(w_2, q, context_string, context, freq_words):
                new_vertex_1, graph, parent_vertex = is_new_vertex(
                    graph, q, w_2, rel, 1, None)
                if new_vertex_1:
                    vertex_1 = create_vertex(q, w_2, None, 1, rel, definitions)
                    graph.append(vertex_1)
    return graph
Example #27
Source File: recall_model.py From Customer-Chatbot with MIT License | 5 votes |
def plot_words(wordList):
    fDist = FreqDist(wordList)
    # print(fDist.most_common())
    print("单词总数: ", fDist.N())    # total number of tokens
    print("不同单词数: ", fDist.B())  # number of distinct word types
    fDist.plot(10)
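Here N() is the total sample count and B() the number of bins (distinct tokens), and FreqDist.plot needs matplotlib installed. A minimal call, assuming plot_words from above is in scope:

plot_words("how are you how are things today".split())   # prints 7 and 5, then plots the top 10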
Example #28
Source File: collocations.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def _apply_filter(self, fn=lambda ngram, freq: False):
    """Generic filter removes ngrams from the frequency distribution
    if the function returns True when passed an ngram tuple.
    """
    tmp_ngram = FreqDist()
    for ngram, freq in iteritems(self.ngram_fd):
        if not fn(ngram, freq):
            tmp_ngram[ngram] = freq
    self.ngram_fd = tmp_ngram
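The public counterpart of this private helper is apply_freq_filter on a collocation finder; a small sketch with toy tokens:

from nltk.collocations import BigramCollocationFinder

finder = BigramCollocationFinder.from_words("a b a b a c".split())
finder.apply_freq_filter(2)             # drop bigrams seen fewer than 2 times
print(sorted(finder.ngram_fd.items()))  # [(('a', 'b'), 2), (('b', 'a'), 2)]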
Example #29
Source File: recall_model.py From Customer-Chatbot with MIT License | 5 votes |
def plot_words(wordList):
    fDist = FreqDist(wordList)
    # print(fDist.most_common())
    print("单词总数: ", fDist.N())    # total number of tokens
    print("不同单词数: ", fDist.B())  # number of distinct word types
    fDist.plot(10)
Example #30
Source File: collocations.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def _ngram_freqdist(words, n):
    return FreqDist(tuple(words[i:i + n]) for i in range(len(words) - 1))
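A hedged sketch of what the bigram case produces (assuming the helper above is in scope; note that the range stops at len(words) - 1 regardless of n, so for n > 2 the trailing tuples come out short):

words = "to be or not to be".split()
print(_ngram_freqdist(words, 2).most_common(2))   # [(('to', 'be'), 2), (('be', 'or'), 1)]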