Python nltk.probability.ConditionalFreqDist() Examples
The following are 19 code examples of nltk.probability.ConditionalFreqDist(), each drawn from an open-source project; the source file and license are listed above each example. You may also want to check out the other available functions and classes of the nltk.probability module.
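Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the basic pattern: ConditionalFreqDist accepts an iterable of (condition, sample) pairs and keeps one FreqDist per condition.

from nltk.probability import ConditionalFreqDist

# toy (condition, sample) pairs: the condition is a text category, the sample a word
pairs = [('news', 'the'), ('news', 'economy'), ('fiction', 'the'), ('fiction', 'dragon')]
cfd = ConditionalFreqDist(pairs)

print(cfd.conditions())     # e.g. ['news', 'fiction']
print(cfd['news']['the'])   # 1 -- count of 'the' under the 'news' condition
print(cfd['fiction'].N())   # 2 -- total samples observed under 'fiction'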
Example #1
Source File: agreement.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
    """Cohen 1968"""
    total = 0.0
    label_freqs = ConditionalFreqDist(
        (x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
    )
    for j in self.K:
        for l in self.K:
            total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
    De = total / (max_distance * pow(len(self.I), 2))
    log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
    Do = self.Do_Kw_pairwise(cA, cB)
    ret = 1.0 - (Do / De)
    return ret
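Here the ConditionalFreqDist is keyed by coder, so label_freqs[coder][label] is how often that coder assigned that label. A minimal sketch of just that counting step, with hypothetical annotation records (not part of agreement.py):

from nltk.probability import ConditionalFreqDist

# hypothetical records in the same shape as self.data above
data = [
    {'coder': 'c1', 'labels': 'pos'},
    {'coder': 'c1', 'labels': 'neg'},
    {'coder': 'c2', 'labels': 'pos'},
]
label_freqs = ConditionalFreqDist(
    (x['coder'], x['labels']) for x in data if x['coder'] in ('c1', 'c2')
)
print(label_freqs['c1']['pos'])  # 1 -- how often coder c1 assigned the label 'pos'
print(label_freqs['c2'].N())     # 1 -- total labels observed for coder c2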
Example #2
Source File: agreement.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
    """Cohen 1968"""
    total = 0.0
    label_freqs = ConditionalFreqDist(
        (x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
    )
    for j in self.K:
        for l in self.K:
            total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
    De = total / (max_distance * pow(len(self.I), 2))
    log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
    Do = self.Do_Kw_pairwise(cA, cB)
    ret = 1.0 - (Do / De)
    return ret
Example #3
Source File: model.py From atap with Apache License 2.0 | 6 votes |
def __init__(self, n, vocabulary, unknown="<UNK>"):
    """
    n is the size of the ngram
    """
    if n < 1:
        raise ValueError("ngram size must be greater than or equal to 1")

    self.n = n
    self.unknown = unknown
    self.padding = {
        "pad_left": True,
        "pad_right": True,
        "left_pad_symbol": "<s>",
        "right_pad_symbol": "</s>",
    }

    self.vocabulary = vocabulary
    self.allgrams = defaultdict(ConditionalFreqDist)
    self.ngrams = FreqDist()
    self.unigrams = FreqDist()
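The defaultdict(ConditionalFreqDist) idiom above lets one mapping hold counts for every n-gram order at once. A minimal sketch of that idea with a single toy trigram (assumed data, not part of model.py):

from collections import defaultdict
from nltk.probability import ConditionalFreqDist

allgrams = defaultdict(ConditionalFreqDist)

# count one toy trigram: the first n-1 tokens are the condition, the last token the sample
trigram = ('<s>', 'the', 'cat')
context, word = trigram[:-1], trigram[-1]
allgrams[3][context][word] += 1

print(allgrams[3][('<s>', 'the')]['cat'])  # 1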
Example #4
Source File: agreement.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
    """Cohen 1968"""
    total = 0.0
    label_freqs = ConditionalFreqDist(
        (x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
    )
    for j in self.K:
        for l in self.K:
            total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
    De = total / (max_distance * pow(len(self.I), 2))
    log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
    Do = self.Do_Kw_pairwise(cA, cB)
    ret = 1.0 - (Do / De)
    return ret
Example #5
Source File: agreement.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def Ae_kappa(self, cA, cB):
    Ae = 0.0
    nitems = float(len(self.I))
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
Example #6
Source File: counter.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def __init__(self, ngram_text=None):
    """Creates a new NgramCounter.

    If `ngram_text` is specified, counts ngrams from it, otherwise waits
    for `update` method to be called explicitly.

    :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
    :type ngram_text: Iterable(Iterable(tuple(str))) or None
    """
    self._counts = defaultdict(ConditionalFreqDist)
    self._counts[1] = self.unigrams = FreqDist()

    if ngram_text:
        self.update(ngram_text)
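This constructor matches the counter NLTK ships as nltk.lm.NgramCounter. Assuming that class, a hedged usage sketch with one toy "sentence" of pre-built n-gram tuples:

from nltk.lm import NgramCounter

# one sentence containing two unigram tuples and one bigram tuple
sents = [[('a',), ('b',), ('a', 'b')]]
counter = NgramCounter(sents)

print(counter[1]['a'])          # unigram count of 'a'
print(counter[2][('a',)]['b'])  # bigram count of 'b' following 'a'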
Example #7
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
    self._key = key
    self._tokens = tokens
    if context_func:
        self._context_func = context_func
    else:
        self._context_func = self._default_context
    if filter:
        tokens = [t for t in tokens if filter(t)]
    self._word_to_contexts = CFD(
        (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
    )
    self._context_to_words = CFD(
        (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
    )
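The two CFD objects built above invert each other: one maps each word to the contexts it occurs in, the other maps a context back to the words seen in it. A standalone sketch of the same pattern, assuming a toy token list and a simple (previous word, next word) context function:

from nltk.probability import ConditionalFreqDist as CFD

tokens = ['the', 'cat', 'sat', 'the', 'dog', 'sat']

def context(tokens, i):
    # (previous word, next word) window with sentence-edge placeholders
    left = tokens[i - 1] if i > 0 else '<START>'
    right = tokens[i + 1] if i < len(tokens) - 1 else '<END>'
    return (left, right)

word_to_contexts = CFD((w, context(tokens, i)) for i, w in enumerate(tokens))
context_to_words = CFD((context(tokens, i), w) for i, w in enumerate(tokens))

print(context_to_words[('the', 'sat')].most_common())  # e.g. [('cat', 1), ('dog', 1)]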
Example #8
Source File: agreement.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def Ae_kappa(self, cA, cB):
    Ae = 0.0
    nitems = float(len(self.I))
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
Example #9
Source File: text.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
    self._key = key
    self._tokens = tokens
    if context_func:
        self._context_func = context_func
    else:
        self._context_func = self._default_context
    if filter:
        tokens = [t for t in tokens if filter(t)]
    self._word_to_contexts = CFD(
        (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
    )
    self._context_to_words = CFD(
        (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
    )
Example #10
Source File: svm_train.py From weiboanalysis with Apache License 2.0 | 5 votes |
def pynlpir_feature(number):  # select `number` feature words
    normalWords = []
    advWords = []
    for items in read_file('ad/normal.txt'):  # flatten the list of lists into one list
        for item in items:
            normalWords.append(item)
    for items in read_file('ad/advertise.txt'):
        for item in items:
            advWords.append(item)

    word_fd = FreqDist()  # frequency of every word across both classes
    cond_word_fd = ConditionalFreqDist()  # word frequencies within the normal texts and the ad texts
    for word in normalWords:
        word_fd[word] += 1
        cond_word_fd['normal'][word] += 1
    for word in advWords:
        word_fd[word] += 1
        cond_word_fd['adv'][word] += 1

    normal_word_count = cond_word_fd['normal'].N()  # number of word tokens in the normal class
    adv_word_count = cond_word_fd['adv'].N()  # number of word tokens in the ad class
    total_word_count = normal_word_count + adv_word_count

    word_scores = {}  # maps each word to its information score
    for word, freq in word_fd.items():
        # chi-squared statistic of the word for the normal class; other measures
        # such as mutual information could be computed here instead
        normal_score = BigramAssocMeasures.chi_sq(cond_word_fd['normal'][word],
                                                  (freq, normal_word_count), total_word_count)
        # likewise for the ad class
        adv_score = BigramAssocMeasures.chi_sq(cond_word_fd['adv'][word],
                                               (freq, adv_word_count), total_word_count)
        # a word's information score is the normal chi-squared statistic plus the ad chi-squared statistic
        word_scores[word] = normal_score + adv_score

    # sort words by information score in descending order; `number` is the feature
    # dimensionality and can be tuned until the result is optimal
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    # chi^2 = sum_i (O_i - E_i)^2 / E_i ~ chi^2(k-1), i = 1..k, where O_i is the observed
    # count and E_i the expected count; the null hypothesis is rejected when the
    # statistic exceeds the critical value
    best_words = set([w for w, s in best_vals])
    return dict([(word, True) for word in best_words])
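The chi-squared call above is nltk.metrics.BigramAssocMeasures.chi_sq: the first argument is the word's count inside one class, the tuple holds the word's overall count and that class's total token count, and the last argument is the grand total. A hedged sketch with made-up counts:

from nltk.metrics import BigramAssocMeasures

# hypothetical counts: the word occurs 30 times in the 'adv' class and 40 times
# overall; the 'adv' class holds 1000 of the 5000 word tokens in total
score = BigramAssocMeasures.chi_sq(30, (40, 1000), 5000)
print(score)  # larger values mean a stronger association between word and class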
Example #11
Source File: sentiment_analysis.py From edx_data_research with MIT License | 5 votes |
def create_word_scores():
    # creates lists of all positive and negative words
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    # build a frequency distribution of all words and then frequency
    # distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd.inc(word.lower())
        cond_word_fd['pos'].inc(word.lower())
    for word in negWords:
        word_fd.inc(word.lower())
        cond_word_fd['neg'].inc(word.lower())

    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores

# finds word scores
Example #12
Source File: text.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
    self._key = key
    self._tokens = tokens
    if context_func:
        self._context_func = context_func
    else:
        self._context_func = self._default_context
    if filter:
        tokens = [t for t in tokens if filter(t)]
    self._word_to_contexts = CFD(
        (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
    )
    self._context_to_words = CFD(
        (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
    )
Example #13
Source File: agreement.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def Ae_kappa(self, cA, cB):
    Ae = 0.0
    nitems = float(len(self.I))
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
Example #14
Source File: ensemble.py From cltk with MIT License | 4 votes |
def _train(self, tagged_corpus: list, cutoff: int = 0, verbose: bool = False):
    """
    Initialize this ContextTagger's ``_context_to_tag`` table based on the
    given training data. In particular, for each context ``c`` in the
    training data, set ``_context_to_tag[c]`` to the most frequent tag for
    that context. However, exclude any contexts that are already tagged
    perfectly by the backoff tagger(s).

    The old value of ``self._context_to_tag`` (if any) is discarded.

    :param tagged_corpus: A tagged corpus. Each item should be a list of (word, tag) tuples.
    :param cutoff: If the most likely tag for a context occurs fewer than
        cutoff times, then exclude it from the context-to-tag table for the new tagger.
    :param verbose: Not used
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if self.backoff is None or tag != self.backoff.tag_one(
                tokens, index, tags[:index]
            ):
                useful_contexts.add(context)

    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is. Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()  # Remove
        weighted_tags = [(k, v / sum(fd[context].values())) for k, v in fd[context].items()]
        hits = fd[context][best_tag]  #INT
        if hits > cutoff:
            self._context_to_tag[context] = weighted_tags
            hit_count += hits
Example #15
Source File: tnt.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def __init__(self, unk=None, Trained=False, N=1000, C=False):
    '''
    Construct a TnT statistical tagger. Tagger must be trained
    before being used to tag input.

    :param unk: instance of a POS tagger, conforms to TaggerI
    :type unk: (TaggerI)
    :param Trained: Indication that the POS tagger is trained or not
    :type Trained: boolean
    :param N: Beam search degree (see above)
    :type N: (int)
    :param C: Capitalization flag
    :type C: boolean

    Initializer, creates frequency distributions to be used
    for tagging

    _lx values represent the portion of the tri/bi/uni taggers
    to be used to calculate the probability

    N value is the number of possible solutions to maintain
    while tagging. A good value for this is 1000

    C is a boolean value which specifies to use or not use the
    Capitalization of the word as additional information for tagging.
    NOTE: using capitalization may not increase the accuracy
    of the tagger
    '''

    self._uni = FreqDist()
    self._bi = ConditionalFreqDist()
    self._tri = ConditionalFreqDist()
    self._wd = ConditionalFreqDist()
    self._eos = ConditionalFreqDist()
    self._l1 = 0.0
    self._l2 = 0.0
    self._l3 = 0.0
    self._N = N
    self._C = C
    self._T = Trained

    self._unk = unk

    # statistical tools (ignore or delete me)
    self.unknown = 0
    self.known = 0
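The ConditionalFreqDist attributes created here are filled in later by the tagger's train() method. NLTK ships the same tagger as nltk.tag.tnt.TnT; assuming that class, a hedged sketch of training and tagging on a toy corpus (real use would train on a full tagged corpus):

from nltk.tag import tnt

train_sents = [
    [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')],
    [('the', 'DT'), ('dog', 'NN'), ('sat', 'VBD')],
]
tagger = tnt.TnT()
tagger.train(train_sents)
print(tagger.tag(['the', 'cat', 'sat']))  # list of (word, tag) pairs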
Example #16
Source File: sequential.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Initialize this ContextTagger's ``_context_to_tag`` table based on
    the given training data.  In particular, for each context ``c`` in
    the training data, set ``_context_to_tag[c]`` to the most frequent
    tag for that context.  However, exclude any contexts that are
    already tagged perfectly by the backoff tagger(s).

    The old value of ``self._context_to_tag`` (if any) is discarded.

    :param tagged_corpus: A tagged corpus.  Each item should be
        a list of (word, tag) tuples.
    :param cutoff: If the most likely tag for a context occurs
        fewer than cutoff times, then exclude it from the
        context-to-tag table for the new tagger.
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context].inc(tag)
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hit_count += hits

    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained Unigram tagger:",
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)

######################################################################
#{ Tagger Classes
######################################################################
Example #17
Source File: sequential.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Initialize this ContextTagger's ``_context_to_tag`` table based on
    the given training data.  In particular, for each context ``c`` in
    the training data, set ``_context_to_tag[c]`` to the most frequent
    tag for that context.  However, exclude any contexts that are
    already tagged perfectly by the backoff tagger(s).

    The old value of ``self._context_to_tag`` (if any) is discarded.

    :param tagged_corpus: A tagged corpus.  Each item should be
        a list of (word, tag) tuples.
    :param cutoff: If the most likely tag for a context occurs
        fewer than cutoff times, then exclude it from the
        context-to-tag table for the new tagger.
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hit_count += hits

    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning))

######################################################################
#{ Tagger Classes
######################################################################
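_train runs automatically when a ContextTagger subclass is constructed with training data. A hedged sketch of that public path, training a UnigramTagger with a cutoff so rarely seen contexts are pruned (assumes the NLTK treebank corpus has been downloaded):

from nltk.corpus import treebank
from nltk.tag import UnigramTagger

train_sents = treebank.tagged_sents()[:500]    # assumes nltk_data 'treebank' is installed
tagger = UnigramTagger(train_sents, cutoff=2, verbose=True)
print(tagger.tag(['The', 'dog', 'barked']))    # words pruned or unseen in training come back as None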
Example #18
Source File: tnt.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def __init__(self, unk=None, Trained=False, N=1000, C=False):
    '''
    Construct a TnT statistical tagger. Tagger must be trained
    before being used to tag input.

    :param unk: instance of a POS tagger, conforms to TaggerI
    :type unk: (TaggerI)
    :param Trained: Indication that the POS tagger is trained or not
    :type Trained: boolean
    :param N: Beam search degree (see above)
    :type N: (int)
    :param C: Capitalization flag
    :type C: boolean

    Initializer, creates frequency distributions to be used
    for tagging

    _lx values represent the portion of the tri/bi/uni taggers
    to be used to calculate the probability

    N value is the number of possible solutions to maintain
    while tagging. A good value for this is 1000

    C is a boolean value which specifies to use or not use the
    Capitalization of the word as additional information for tagging.
    NOTE: using capitalization may not increase the accuracy
    of the tagger
    '''

    self._uni = FreqDist()
    self._bi = ConditionalFreqDist()
    self._tri = ConditionalFreqDist()
    self._wd = ConditionalFreqDist()
    self._eos = ConditionalFreqDist()
    self._l1 = 0.0
    self._l2 = 0.0
    self._l3 = 0.0
    self._N = N
    self._C = C
    self._T = Trained

    self._unk = unk

    # statistical tools (ignore or delete me)
    self.unknown = 0
    self.known = 0
Example #19
Source File: tnt.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def __init__(self, unk=None, Trained=False, N=1000, C=False):
    '''
    Construct a TnT statistical tagger. Tagger must be trained
    before being used to tag input.

    :param unk: instance of a POS tagger, conforms to TaggerI
    :type unk: (TaggerI)
    :param Trained: Indication that the POS tagger is trained or not
    :type Trained: boolean
    :param N: Beam search degree (see above)
    :type N: (int)
    :param C: Capitalization flag
    :type C: boolean

    Initializer, creates frequency distributions to be used
    for tagging

    _lx values represent the portion of the tri/bi/uni taggers
    to be used to calculate the probability

    N value is the number of possible solutions to maintain
    while tagging. A good value for this is 1000

    C is a boolean value which specifies to use or not use the
    Capitalization of the word as additional information for tagging.
    NOTE: using capitalization may not increase the accuracy
    of the tagger
    '''

    self._uni = FreqDist()
    self._bi = ConditionalFreqDist()
    self._tri = ConditionalFreqDist()
    self._wd = ConditionalFreqDist()
    self._eos = ConditionalFreqDist()
    self._l1 = 0.0
    self._l2 = 0.0
    self._l3 = 0.0
    self._N = N
    self._C = C
    self._T = Trained

    self._unk = unk

    # statistical tools (ignore or delete me)
    self.unknown = 0
    self.known = 0