Python nltk.collocations.BigramCollocationFinder.from_words() Examples
The following are 13 code examples of nltk.collocations.BigramCollocationFinder.from_words().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module nltk.collocations.BigramCollocationFinder, or try the search function.
Example #1
Source File: text.py From razzy-spinner with GNU General Public License v3.0 | 7 votes |
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    The n-best list is cached on the instance in ``_collocations`` and is
    only rebuilt when ``num`` or ``window_size`` differ from the values
    used for the cached result.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    cache_is_valid = (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    )
    if not cache_is_valid:
        self._num = num
        self._window_size = window_size

        from nltk.corpus import stopwords
        # Use a set: the word filter below tests membership for the words
        # of the text, and a list would be scanned linearly on every test.
        ignored_words = set(stopwords.words('english'))
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        # Rank the remaining bigrams by likelihood ratio and keep the top `num`.
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)

    colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
Example #2
Source File: test_collocations.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def test_bigram2(self):
    """Check bigram counts, word counts and PMI scores with the default window."""
    tokens = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(tokens)

    expected_bigram_counts = [
        (('a', 'a'), 1),
        (('a', 'test'), 1),
        (('is', 'a'), 1),
        (('is', 'is'), 1),
        (('test', 'test'), 1),
        (('this', 'is'), 1),
        (('this', 'this'), 1),
    ]
    expected_word_counts = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
    expected_pmi = [
        (('a', 'a'), 1.0),
        (('a', 'test'), 1.0),
        (('is', 'a'), 1.0),
        (('is', 'is'), 1.0),
        (('test', 'test'), 1.0),
        (('this', 'is'), 1.0),
        (('this', 'this'), 1.0),
    ]

    #python 2.6 does not have assertItemsEqual or assertListEqual
    self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_bigram_counts))
    self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_word_counts))

    # With the default window there is one bigram fewer than there are words.
    total_words = sum(finder.word_fd.values())
    total_bigrams = sum(finder.ngram_fd.values())
    self.assertTrue(len(tokens) == total_words == total_bigrams + 1)

    actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(actual_pmi, sorted(expected_pmi)))
Example #3
Source File: test_collocations.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def test_bigram3(self):
    """Check bigram counts, word counts and PMI scores with window_size=3."""
    tokens = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(tokens, window_size=3)

    expected_bigram_counts = [
        (('a', 'test'), 3),
        (('is', 'a'), 3),
        (('this', 'is'), 3),
        (('a', 'a'), 1),
        (('is', 'is'), 1),
        (('test', 'test'), 1),
        (('this', 'this'), 1),
    ]
    expected_word_counts = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
    expected_pmi = [
        (('a', 'test'), 1.584962500721156),
        (('is', 'a'), 1.584962500721156),
        (('this', 'is'), 1.584962500721156),
        (('a', 'a'), 0.0),
        (('is', 'is'), 0.0),
        (('test', 'test'), 0.0),
        (('this', 'this'), 0.0),
    ]

    self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_bigram_counts))
    self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_word_counts))

    total_words = sum(finder.word_fd.values())
    total_bigrams = sum(finder.ngram_fd.values())
    self.assertTrue(len(tokens) == total_words == (total_bigrams + 2 + 1) / 2.0)

    actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(actual_pmi, sorted(expected_pmi)))
Example #4
Source File: test_collocations.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def test_bigram5(self):
    """Check bigram counts, word counts and PMI scores with window_size=5."""
    tokens = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(tokens, window_size=5)

    expected_bigram_counts = [
        (('a', 'test'), 4),
        (('is', 'a'), 4),
        (('this', 'is'), 4),
        (('is', 'test'), 3),
        (('this', 'a'), 3),
        (('a', 'a'), 1),
        (('is', 'is'), 1),
        (('test', 'test'), 1),
        (('this', 'this'), 1),
    ]
    expected_word_counts = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
    expected_pmi = [
        (('a', 'test'), 1.0),
        (('is', 'a'), 1.0),
        (('this', 'is'), 1.0),
        (('is', 'test'), 0.5849625007211562),
        (('this', 'a'), 0.5849625007211562),
        (('a', 'a'), -1.0),
        (('is', 'is'), -1.0),
        (('test', 'test'), -1.0),
        (('this', 'this'), -1.0),
    ]

    self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_bigram_counts))
    self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_word_counts))

    total_words = sum(finder.word_fd.values())
    total_bigrams = sum(finder.ngram_fd.values())
    self.assertTrue(len(tokens) == total_words == (total_bigrams + 4 + 3 + 2 + 1) / 4.0)

    actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(actual_pmi, sorted(expected_pmi)))
Example #5
Source File: eval_utils.py From tf-var-attention with MIT License | 6 votes |
def calculate_ngram_diversity(corpus):
    """
    Calculates unigram and bigram diversity

    Args:
        corpus: tokenized list of sentences sampled. It is consumed as one
            sequence: passed to ``BigramCollocationFinder.from_words`` and
            ``FreqDist``, and ``len()`` is taken of it.

    Returns:
        uni_diversity: distinct-1 score (distinct items / ``len(corpus)``)
        bi_diversity: distinct-2 score (distinct bigrams / ``bigram_finder.N``)

        Both scores are 0.0 when ``corpus`` is empty.
    """
    # len(corpus) is a denominator below; an empty corpus would raise
    # ZeroDivisionError instead of yielding a score.
    if len(corpus) == 0:
        return 0.0, 0.0

    bigram_finder = BigramCollocationFinder.from_words(corpus)
    # NOTE(review): the denominator is the finder's N, not the number of
    # bigram occurrences -- confirm this is the intended normalisation.
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity
Example #6
Source File: text.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    The n-best list is cached on the instance in ``_collocations`` and is
    only rebuilt when ``num`` or ``window_size`` differ from the values
    used for the cached result.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    cache_is_valid = (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    )
    if not cache_is_valid:
        self._num = num
        self._window_size = window_size

        # Single-argument print(...) is valid both as a Python 2 print
        # statement and as a Python 3 function call, with identical output.
        print("Building collocations list")
        from nltk.corpus import stopwords
        # Use a set: the word filter below tests membership for the words
        # of the text, and a list would be scanned linearly on every test.
        ignored_words = set(stopwords.words('english'))
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        # Rank the remaining bigrams by likelihood ratio and keep the top `num`.
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)

    colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
Example #7
Source File: metric.py From MultiTurnDialogZoo with MIT License | 6 votes |
def cal_Distinct(corpus):
    """
    Calculates unigram and bigram diversity

    Args:
        corpus: tokenized list of sentences sampled. It is consumed as one
            sequence: passed to ``BigramCollocationFinder.from_words`` and
            ``FreqDist``, and ``len()`` is taken of it.

    Returns:
        uni_diversity: distinct-1 score (distinct items / ``len(corpus)``)
        bi_diversity: distinct-2 score (distinct bigrams / ``bigram_finder.N``)

        Both scores are 0.0 when ``corpus`` is empty.
    """
    # len(corpus) is a denominator below; an empty corpus would raise
    # ZeroDivisionError instead of yielding a score.
    if len(corpus) == 0:
        return 0.0, 0.0

    bigram_finder = BigramCollocationFinder.from_words(corpus)
    # NOTE(review): the denominator is the finder's N, not the number of
    # bigram occurrences -- confirm this is the intended normalisation.
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity
Example #8
Source File: load_samples.py From yenlp with GNU General Public License v3.0 | 5 votes |
def bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    """Build a feature dict from the words of *text* plus its best bigrams.

    The text is tokenized, the ``n`` best bigrams under ``score_fn`` are
    selected, and every word and selected bigram is mapped to ``True``.
    """
    words = tokenize(text)
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)
    features = {}
    for ngram in itertools.chain(words, top_bigrams):
        features[ngram] = True
    return features
Example #9
Source File: load_samples.py From yenlp with GNU General Public License v3.0 | 5 votes |
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    """Build a feature dict from the non-stopword words of *text* and their best bigrams.

    English stopwords are dropped from the tokenized text first; the ``n``
    best bigrams under ``score_fn`` are then selected from what remains,
    and every kept word and selected bigram is mapped to ``True``.
    """
    stopset = set(stopwords.words('english'))
    words = []
    for word in tokenize(text):
        if word not in stopset:
            words.append(word)
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)
    features = {}
    for ngram in itertools.chain(words, top_bigrams):
        features[ngram] = True
    return features
Example #10
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def collocation_list(self, num=20, window_size=2):
    """
    Return collocations derived from the text, ignoring stopwords.

    The n-best list is cached on the instance in ``_collocations`` and is
    only rebuilt when ``num`` or ``window_size`` differ from the values
    used for the cached result.

    :param num: The maximum number of collocations to return.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    :return: Each collocation as a single string, its two words joined by a space.
    """
    cache_is_valid = (
        "_collocations" in self.__dict__
        and self._num == num
        and self._window_size == window_size
    )
    if not cache_is_valid:
        self._num = num
        self._window_size = window_size

        from nltk.corpus import stopwords

        # Use a set: the word filter below tests membership for the words
        # of the text, and a list would be scanned linearly on every test.
        ignored_words = set(stopwords.words("english"))
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        # Rank the remaining bigrams by likelihood ratio and keep the top `num`.
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)

    return [w1 + " " + w2 for w1, w2 in self._collocations]
Example #11
Source File: test_collocations.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def test_bigram2(self):
    """Verify bigram/word frequencies and PMI scores for adjacent-word bigrams."""
    words = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(words)

    bigram_counts = {
        ('a', 'a'): 1,
        ('a', 'test'): 1,
        ('is', 'a'): 1,
        ('is', 'is'): 1,
        ('test', 'test'): 1,
        ('this', 'is'): 1,
        ('this', 'this'): 1,
    }
    word_counts = {'a': 2, 'is': 2, 'test': 2, 'this': 2}
    pmi_scores = {
        ('a', 'a'): 1.0,
        ('a', 'test'): 1.0,
        ('is', 'a'): 1.0,
        ('is', 'is'): 1.0,
        ('test', 'test'): 1.0,
        ('this', 'is'): 1.0,
        ('this', 'this'): 1.0,
    }

    # python 2.6 does not have assertItemsEqual or assertListEqual
    self.assertEqual(sorted(finder.ngram_fd.items()), sorted(bigram_counts.items()))
    self.assertEqual(sorted(finder.word_fd.items()), sorted(word_counts.items()))

    # With the default window there is one bigram fewer than there are words.
    word_total = sum(finder.word_fd.values())
    bigram_total = sum(finder.ngram_fd.values())
    self.assertTrue(len(words) == word_total == bigram_total + 1)

    scored = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(scored, sorted(pmi_scores.items())))
Example #12
Source File: test_collocations.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def test_bigram3(self):
    """Verify bigram/word frequencies and PMI scores for a window of 3 tokens."""
    words = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(words, window_size=3)

    bigram_counts = {
        ('a', 'test'): 3,
        ('is', 'a'): 3,
        ('this', 'is'): 3,
        ('a', 'a'): 1,
        ('is', 'is'): 1,
        ('test', 'test'): 1,
        ('this', 'this'): 1,
    }
    word_counts = {'a': 2, 'is': 2, 'test': 2, 'this': 2}
    pmi_scores = {
        ('a', 'test'): 1.584962500721156,
        ('is', 'a'): 1.584962500721156,
        ('this', 'is'): 1.584962500721156,
        ('a', 'a'): 0.0,
        ('is', 'is'): 0.0,
        ('test', 'test'): 0.0,
        ('this', 'this'): 0.0,
    }

    self.assertEqual(sorted(finder.ngram_fd.items()), sorted(bigram_counts.items()))
    self.assertEqual(sorted(finder.word_fd.items()), sorted(word_counts.items()))

    word_total = sum(finder.word_fd.values())
    bigram_total = sum(finder.ngram_fd.values())
    self.assertTrue(len(words) == word_total == (bigram_total + 2 + 1) / 2.0)

    scored = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(scored, sorted(pmi_scores.items())))
Example #13
Source File: test_collocations.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def test_bigram5(self):
    """Verify bigram/word frequencies and PMI scores for a window of 5 tokens."""
    words = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(words, window_size=5)

    bigram_counts = {
        ('a', 'test'): 4,
        ('is', 'a'): 4,
        ('this', 'is'): 4,
        ('is', 'test'): 3,
        ('this', 'a'): 3,
        ('a', 'a'): 1,
        ('is', 'is'): 1,
        ('test', 'test'): 1,
        ('this', 'this'): 1,
    }
    word_counts = {'a': 2, 'is': 2, 'test': 2, 'this': 2}
    pmi_scores = {
        ('a', 'test'): 1.0,
        ('is', 'a'): 1.0,
        ('this', 'is'): 1.0,
        ('is', 'test'): 0.5849625007211562,
        ('this', 'a'): 0.5849625007211562,
        ('a', 'a'): -1.0,
        ('is', 'is'): -1.0,
        ('test', 'test'): -1.0,
        ('this', 'this'): -1.0,
    }

    self.assertEqual(sorted(finder.ngram_fd.items()), sorted(bigram_counts.items()))
    self.assertEqual(sorted(finder.word_fd.items()), sorted(word_counts.items()))

    word_total = sum(finder.word_fd.values())
    bigram_total = sum(finder.ngram_fd.values())
    self.assertTrue(len(words) == word_total == (bigram_total + 4 + 3 + 2 + 1) / 4.0)

    scored = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(scored, sorted(pmi_scores.items())))