Python nltk.probability.ConditionalFreqDist() Examples
The following are 19 code examples of nltk.probability.ConditionalFreqDist(), each drawn from an open-source project; the source file and license are listed above each example. You may also want to check out the other available functions and classes of the nltk.probability module.
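Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the basic pattern: ConditionalFreqDist accepts an iterable of (condition, sample) pairs and keeps one FreqDist per condition.

from nltk.probability import ConditionalFreqDist

# toy (condition, sample) pairs: the condition is a text category, the sample a word
pairs = [('news', 'the'), ('news', 'economy'), ('fiction', 'the'), ('fiction', 'dragon')]
cfd = ConditionalFreqDist(pairs)

print(cfd.conditions())     # e.g. ['news', 'fiction']
print(cfd['news']['the'])   # 1 -- count of 'the' under the 'news' condition
print(cfd['fiction'].N())   # 2 -- total samples observed under 'fiction'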
Example #1
Source File: agreement.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
    """Cohen 1968"""
    total = 0.0
    label_freqs = ConditionalFreqDist(
        (x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
    )
    for j in self.K:
        for l in self.K:
            total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
    De = total / (max_distance * pow(len(self.I), 2))
    log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
    Do = self.Do_Kw_pairwise(cA, cB)
    ret = 1.0 - (Do / De)
    return ret
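Here the ConditionalFreqDist is keyed by coder, so label_freqs[coder][label] is how often that coder assigned that label. A minimal sketch of just that counting step, with hypothetical annotation records (not part of agreement.py):

from nltk.probability import ConditionalFreqDist

# hypothetical records in the same shape as self.data above
data = [
    {'coder': 'c1', 'labels': 'pos'},
    {'coder': 'c1', 'labels': 'neg'},
    {'coder': 'c2', 'labels': 'pos'},
]
label_freqs = ConditionalFreqDist(
    (x['coder'], x['labels']) for x in data if x['coder'] in ('c1', 'c2')
)
print(label_freqs['c1']['pos'])  # 1 -- how often coder c1 assigned the label 'pos'
print(label_freqs['c2'].N())     # 1 -- total labels observed for coder c2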
Example #2
Source File: agreement.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
    """Cohen 1968"""
    total = 0.0
    label_freqs = ConditionalFreqDist(
        (x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
    )
    for j in self.K:
        for l in self.K:
            total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
    De = total / (max_distance * pow(len(self.I), 2))
    log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
    Do = self.Do_Kw_pairwise(cA, cB)
    ret = 1.0 - (Do / De)
    return ret
Example #3
Source File: model.py From atap with Apache License 2.0 | 6 votes |
def __init__(self, n, vocabulary, unknown="<UNK>"):
    """
    n is the size of the ngram
    """
    if n < 1:
        raise ValueError("ngram size must be greater than or equal to 1")

    self.n = n
    self.unknown = unknown
    self.padding = {
        "pad_left": True,
        "pad_right": True,
        "left_pad_symbol": "<s>",
        "right_pad_symbol": "</s>",
    }

    self.vocabulary = vocabulary
    self.allgrams = defaultdict(ConditionalFreqDist)
    self.ngrams = FreqDist()
    self.unigrams = FreqDist()
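The defaultdict(ConditionalFreqDist) idiom above lets one mapping hold counts for every n-gram order at once. A minimal sketch of that idea with a single toy trigram (assumed data, not part of model.py):

from collections import defaultdict
from nltk.probability import ConditionalFreqDist

allgrams = defaultdict(ConditionalFreqDist)

# count one toy trigram: the first n-1 tokens are the condition, the last token the sample
trigram = ('<s>', 'the', 'cat')
context, word = trigram[:-1], trigram[-1]
allgrams[3][context][word] += 1

print(allgrams[3][('<s>', 'the')]['cat'])  # 1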
Example #4
Source File: agreement.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
    """Cohen 1968"""
    total = 0.0
    label_freqs = ConditionalFreqDist(
        (x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
    )
    for j in self.K:
        for l in self.K:
            total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
    De = total / (max_distance * pow(len(self.I), 2))
    log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
    Do = self.Do_Kw_pairwise(cA, cB)
    ret = 1.0 - (Do / De)
    return ret
Example #5
Source File: agreement.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def Ae_kappa(self, cA, cB):
    Ae = 0.0
    nitems = float(len(self.I))
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
Example #6
Source File: counter.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def __init__(self, ngram_text=None):
    """Creates a new NgramCounter.

    If `ngram_text` is specified, counts ngrams from it, otherwise waits
    for `update` method to be called explicitly.

    :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
    :type ngram_text: Iterable(Iterable(tuple(str))) or None
    """
    self._counts = defaultdict(ConditionalFreqDist)
    self._counts[1] = self.unigrams = FreqDist()

    if ngram_text:
        self.update(ngram_text)
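This constructor matches the counter NLTK ships as nltk.lm.NgramCounter. Assuming that class, a hedged usage sketch with one toy "sentence" of pre-built n-gram tuples:

from nltk.lm import NgramCounter

# one sentence containing two unigram tuples and one bigram tuple
sents = [[('a',), ('b',), ('a', 'b')]]
counter = NgramCounter(sents)

print(counter[1]['a'])          # unigram count of 'a'
print(counter[2][('a',)]['b'])  # bigram count of 'b' following 'a'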
Example #7
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
    self._key = key
    self._tokens = tokens
    if context_func:
        self._context_func = context_func
    else:
        self._context_func = self._default_context
    if filter:
        tokens = [t for t in tokens if filter(t)]
    self._word_to_contexts = CFD(
        (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
    )
    self._context_to_words = CFD(
        (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
    )
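The two CFD objects built above invert each other: one maps each word to the contexts it occurs in, the other maps a context back to the words seen in it. A standalone sketch of the same pattern, assuming a toy token list and a simple (previous word, next word) context function:

from nltk.probability import ConditionalFreqDist as CFD

tokens = ['the', 'cat', 'sat', 'the', 'dog', 'sat']

def context(tokens, i):
    # (previous word, next word) window with sentence-edge placeholders
    left = tokens[i - 1] if i > 0 else '<START>'
    right = tokens[i + 1] if i < len(tokens) - 1 else '<END>'
    return (left, right)

word_to_contexts = CFD((w, context(tokens, i)) for i, w in enumerate(tokens))
context_to_words = CFD((context(tokens, i), w) for i, w in enumerate(tokens))

print(context_to_words[('the', 'sat')].most_common())  # e.g. [('cat', 1), ('dog', 1)]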
Example #8
Source File: agreement.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def Ae_kappa(self, cA, cB):
    Ae = 0.0
    nitems = float(len(self.I))
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
Example #9
Source File: text.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
    self._key = key
    self._tokens = tokens
    if context_func:
        self._context_func = context_func
    else:
        self._context_func = self._default_context
    if filter:
        tokens = [t for t in tokens if filter(t)]
    self._word_to_contexts = CFD(
        (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
    )
    self._context_to_words = CFD(
        (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
    )
Example #10
Source File: svm_train.py From weiboanalysis with Apache License 2.0 | 5 votes |
def pynlpir_feature(number):  # select `number` feature words
    normalWords = []
    advWords = []
    for items in read_file('ad/normal.txt'):  # flatten the list of lists into one list
        for item in items:
            normalWords.append(item)
    for items in read_file('ad/advertise.txt'):
        for item in items:
            advWords.append(item)

    word_fd = FreqDist()  # frequency of every word across both classes
    cond_word_fd = ConditionalFreqDist()  # word frequencies within the normal texts and the ad texts
    for word in normalWords:
        word_fd[word] += 1
        cond_word_fd['normal'][word] += 1
    for word in advWords:
        word_fd[word] += 1
        cond_word_fd['adv'][word] += 1

    normal_word_count = cond_word_fd['normal'].N()  # number of word tokens in the normal class
    adv_word_count = cond_word_fd['adv'].N()  # number of word tokens in the ad class
    total_word_count = normal_word_count + adv_word_count

    word_scores = {}  # maps each word to its information score
    for word, freq in word_fd.items():
        # chi-squared statistic of the word for the normal class; other measures
        # such as mutual information could be computed here instead
        normal_score = BigramAssocMeasures.chi_sq(cond_word_fd['normal'][word],
                                                  (freq, normal_word_count), total_word_count)
        # likewise for the ad class
        adv_score = BigramAssocMeasures.chi_sq(cond_word_fd['adv'][word],
                                               (freq, adv_word_count), total_word_count)
        # a word's information score is the normal chi-squared statistic plus the ad chi-squared statistic
        word_scores[word] = normal_score + adv_score

    # sort words by information score in descending order; `number` is the feature
    # dimensionality and can be tuned until the result is optimal
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    # chi^2 = sum_i (O_i - E_i)^2 / E_i ~ chi^2(k-1), i = 1..k, where O_i is the observed
    # count and E_i the expected count; the null hypothesis is rejected when the
    # statistic exceeds the critical value
    best_words = set([w for w, s in best_vals])
    return dict([(word, True) for word in best_words])
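The chi-squared call above is nltk.metrics.BigramAssocMeasures.chi_sq: the first argument is the word's count inside one class, the tuple holds the word's overall count and that class's total token count, and the last argument is the grand total. A hedged sketch with made-up counts:

from nltk.metrics import BigramAssocMeasures

# hypothetical counts: the word occurs 30 times in the 'adv' class and 40 times
# overall; the 'adv' class holds 1000 of the 5000 word tokens in total
score = BigramAssocMeasures.chi_sq(30, (40, 1000), 5000)
print(score)  # larger values mean a stronger association between word and class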
Example #11
Source File: sentiment_analysis.py From edx_data_research with MIT License | 5 votes |
def create_word_scores():
    # creates lists of all positive and negative words
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    # build a frequency distribution of all words and then frequency
    # distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd.inc(word.lower())
        cond_word_fd['pos'].inc(word.lower())
    for word in negWords:
        word_fd.inc(word.lower())
        cond_word_fd['neg'].inc(word.lower())

    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores

# finds word scores
Example #12
Source File: text.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
    self._key = key
    self._tokens = tokens
    if context_func:
        self._context_func = context_func
    else:
        self._context_func = self._default_context
    if filter:
        tokens = [t for t in tokens if filter(t)]
    self._word_to_contexts = CFD(
        (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
    )
    self._context_to_words = CFD(
        (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
    )
Example #13
Source File: agreement.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def Ae_kappa(self, cA, cB):
    Ae = 0.0
    nitems = float(len(self.I))
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
Example #14
Source File: ensemble.py From cltk with MIT License | 4 votes |
def _train(self, tagged_corpus: list, cutoff: int = 0, verbose: bool = False):
    """
    Initialize this ContextTagger's ``_context_to_tag`` table based on the
    given training data. In particular, for each context ``c`` in the
    training data, set ``_context_to_tag[c]`` to the most frequent tag for
    that context. However, exclude any contexts that are already tagged
    perfectly by the backoff tagger(s).

    The old value of ``self._context_to_tag`` (if any) is discarded.

    :param tagged_corpus: A tagged corpus. Each item should be a list of (word, tag) tuples.
    :param cutoff: If the most likely tag for a context occurs fewer than
        cutoff times, then exclude it from the context-to-tag table for the new tagger.
    :param verbose: Not used
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if self.backoff is None or tag != self.backoff.tag_one(
                tokens, index, tags[:index]
            ):
                useful_contexts.add(context)

    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is. Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()  # Remove
        weighted_tags = [(k, v / sum(fd[context].values())) for k, v in fd[context].items()]
        hits = fd[context][best_tag]  #INT
        if hits > cutoff:
            self._context_to_tag[context] = weighted_tags
            hit_count += hits
Example #15
Source File: tnt.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def __init__(self, unk=None, Trained=False, N=1000, C=False):
    '''
    Construct a TnT statistical tagger. Tagger must be trained
    before being used to tag input.

    :param unk: instance of a POS tagger, conforms to TaggerI
    :type unk: (TaggerI)
    :param Trained: Indication that the POS tagger is trained or not
    :type Trained: boolean
    :param N: Beam search degree (see above)
    :type N: (int)
    :param C: Capitalization flag
    :type C: boolean

    Initializer, creates frequency distributions to be used
    for tagging

    _lx values represent the portion of the tri/bi/uni taggers
    to be used to calculate the probability

    N value is the number of possible solutions to maintain
    while tagging. A good value for this is 1000

    C is a boolean value which specifies to use or not use the
    Capitalization of the word as additional information for tagging.
    NOTE: using capitalization may not increase the accuracy
    of the tagger
    '''

    self._uni = FreqDist()
    self._bi = ConditionalFreqDist()
    self._tri = ConditionalFreqDist()
    self._wd = ConditionalFreqDist()
    self._eos = ConditionalFreqDist()
    self._l1 = 0.0
    self._l2 = 0.0
    self._l3 = 0.0
    self._N = N
    self._C = C
    self._T = Trained

    self._unk = unk

    # statistical tools (ignore or delete me)
    self.unknown = 0
    self.known = 0
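The ConditionalFreqDist attributes created here are filled in later by the tagger's train() method. NLTK ships the same tagger as nltk.tag.tnt.TnT; assuming that class, a hedged sketch of training and tagging on a toy corpus (real use would train on a full tagged corpus):

from nltk.tag import tnt

train_sents = [
    [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')],
    [('the', 'DT'), ('dog', 'NN'), ('sat', 'VBD')],
]
tagger = tnt.TnT()
tagger.train(train_sents)
print(tagger.tag(['the', 'cat', 'sat']))  # list of (word, tag) pairs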
Example #16
Source File: sequential.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Initialize this ContextTagger's ``_context_to_tag`` table based on
    the given training data.  In particular, for each context ``c`` in
    the training data, set ``_context_to_tag[c]`` to the most frequent
    tag for that context.  However, exclude any contexts that are
    already tagged perfectly by the backoff tagger(s).

    The old value of ``self._context_to_tag`` (if any) is discarded.

    :param tagged_corpus: A tagged corpus.  Each item should be
        a list of (word, tag) tuples.
    :param cutoff: If the most likely tag for a context occurs
        fewer than cutoff times, then exclude it from the
        context-to-tag table for the new tagger.
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context].inc(tag)
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hit_count += hits

    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained Unigram tagger:",
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)

######################################################################
#{ Tagger Classes
######################################################################
Example #17
Source File: sequential.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Initialize this ContextTagger's ``_context_to_tag`` table based on
    the given training data.  In particular, for each context ``c`` in
    the training data, set ``_context_to_tag[c]`` to the most frequent
    tag for that context.  However, exclude any contexts that are
    already tagged perfectly by the backoff tagger(s).

    The old value of ``self._context_to_tag`` (if any) is discarded.

    :param tagged_corpus: A tagged corpus.  Each item should be
        a list of (word, tag) tuples.
    :param cutoff: If the most likely tag for a context occurs
        fewer than cutoff times, then exclude it from the
        context-to-tag table for the new tagger.
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hit_count += hits

    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning))

######################################################################
#{ Tagger Classes
######################################################################
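_train runs automatically when a ContextTagger subclass is constructed with training data. A hedged sketch of that public path, training a UnigramTagger with a cutoff so rarely seen contexts are pruned (assumes the NLTK treebank corpus has been downloaded):

from nltk.corpus import treebank
from nltk.tag import UnigramTagger

train_sents = treebank.tagged_sents()[:500]    # assumes nltk_data 'treebank' is installed
tagger = UnigramTagger(train_sents, cutoff=2, verbose=True)
print(tagger.tag(['The', 'dog', 'barked']))    # words pruned or unseen in training come back as None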
Example #18
Source File: tnt.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def __init__(self, unk=None, Trained=False, N=1000, C=False):
    '''
    Construct a TnT statistical tagger. Tagger must be trained
    before being used to tag input.

    :param unk: instance of a POS tagger, conforms to TaggerI
    :type unk: (TaggerI)
    :param Trained: Indication that the POS tagger is trained or not
    :type Trained: boolean
    :param N: Beam search degree (see above)
    :type N: (int)
    :param C: Capitalization flag
    :type C: boolean

    Initializer, creates frequency distributions to be used
    for tagging

    _lx values represent the portion of the tri/bi/uni taggers
    to be used to calculate the probability

    N value is the number of possible solutions to maintain
    while tagging. A good value for this is 1000

    C is a boolean value which specifies to use or not use the
    Capitalization of the word as additional information for tagging.
    NOTE: using capitalization may not increase the accuracy
    of the tagger
    '''

    self._uni = FreqDist()
    self._bi = ConditionalFreqDist()
    self._tri = ConditionalFreqDist()
    self._wd = ConditionalFreqDist()
    self._eos = ConditionalFreqDist()
    self._l1 = 0.0
    self._l2 = 0.0
    self._l3 = 0.0
    self._N = N
    self._C = C
    self._T = Trained

    self._unk = unk

    # statistical tools (ignore or delete me)
    self.unknown = 0
    self.known = 0
Example #19
Source File: tnt.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def __init__(self, unk=None, Trained=False, N=1000, C=False):
    '''
    Construct a TnT statistical tagger. Tagger must be trained
    before being used to tag input.

    :param unk: instance of a POS tagger, conforms to TaggerI
    :type unk: (TaggerI)
    :param Trained: Indication that the POS tagger is trained or not
    :type Trained: boolean
    :param N: Beam search degree (see above)
    :type N: (int)
    :param C: Capitalization flag
    :type C: boolean

    Initializer, creates frequency distributions to be used
    for tagging

    _lx values represent the portion of the tri/bi/uni taggers
    to be used to calculate the probability

    N value is the number of possible solutions to maintain
    while tagging. A good value for this is 1000

    C is a boolean value which specifies to use or not use the
    Capitalization of the word as additional information for tagging.
    NOTE: using capitalization may not increase the accuracy
    of the tagger
    '''

    self._uni = FreqDist()
    self._bi = ConditionalFreqDist()
    self._tri = ConditionalFreqDist()
    self._wd = ConditionalFreqDist()
    self._eos = ConditionalFreqDist()
    self._l1 = 0.0
    self._l2 = 0.0
    self._l3 = 0.0
    self._N = N
    self._C = C
    self._T = Trained

    self._unk = unk

    # statistical tools (ignore or delete me)
    self.unknown = 0
    self.known = 0