Python nltk.corpus.brown.words() Examples
The following are 30 code examples of nltk.corpus.brown.words(), collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the module nltk.corpus.brown.
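Before the examples, here is a minimal sketch of calling nltk.corpus.brown.words() directly; it assumes the Brown corpus data has been fetched with nltk.download.

import nltk
from nltk.corpus import brown

# One-time download of the corpus data (a no-op if it is already installed).
nltk.download('brown', quiet=True)

# The whole corpus as a flat, lazily loaded sequence of word tokens.
words = brown.words()
print(len(words))                  # roughly 1.16 million tokens
print(words[:10])                  # the first few tokens

# Tokens can also be restricted to one or more categories.
print(brown.categories()[:5])
news_words = brown.words(categories='news')
print(len(news_words))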
Example #1
Source File: text.py From razzy-spinner with GNU General Public License v3.0 | 7 votes |
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and
            self._window_size == window_size):
        self._num = num
        self._window_size = window_size

        #print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
Example #2
Source File: chunking.py From Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition with MIT License | 6 votes |
def splitter(content, num_of_words):
    words = content.split(' ')
    result = []
    current_count = 0
    current_words = []

    for word in words:
        current_words.append(word)
        current_count += 1
        if current_count == num_of_words:
            result.append(' '.join(current_words))
            current_words = []
            current_count = 0

    result.append(' '.join(current_words))
    return result
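A possible way to feed this splitter with Brown text follows; the 10,000-token slice and the 1,700-word chunk size are illustrative values, not taken from the original recipe.

from nltk.corpus import brown

# Rebuild a plain string from the first Brown tokens, then chunk it.
content = ' '.join(brown.words()[:10000])
chunks = splitter(content, 1700)           # splitter() as defined above
print('Number of text chunks =', len(chunks))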
Example #3
Source File: phrase-extraction.py From PyRATA with Apache License 2.0 | 6 votes |
def brown_data():
    """return the text_length first tokens of the brown corpus tagged in pyrata format"""
    tokens = brown.words()
    tokens = tokens[:text_length]
    pos_tags = nltk.pos_tag(tokens)
    return [{'raw': w, 'pos': p} for (w, p) in pos_tags]

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
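brown_data() relies on a module-level text_length and on the nltk and brown imports made elsewhere in the script; a standalone sketch with an assumed text_length (nltk.pos_tag also needs the averaged_perceptron_tagger model downloaded) might look like this:

import nltk
from nltk.corpus import brown

text_length = 200                          # assumed value; the original script sets its own

tokens = brown.words()[:text_length]
pos_tags = nltk.pos_tag(tokens)
data = [{'raw': w, 'pos': p} for (w, p) in pos_tags]
print(data[:3])                            # e.g. [{'raw': 'The', 'pos': 'DT'}, ...]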
Example #4
Source File: FullNP.py From PyRATA with Apache License 2.0 | 6 votes |
def brown_data():
    """return the text_length first tokens of the brown corpus tagged in pyrata format"""
    tokens = brown.words()
    tokens = tokens[:text_length]
    pos_tags = nltk.pos_tag(tokens)
    return [{'raw': w, 'pos': p} for (w, p) in pos_tags]

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Example #5
Source File: text.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and
            self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        print "Building collocations list"
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print tokenwrap(colloc_strings, separator="; ")
Example #6
Source File: chunking.py From Python-Machine-Learning-Cookbook-Second-Edition with MIT License | 6 votes |
def splitter(data, num_words):
    words = data.split(' ')
    output = []

    cur_count = 0
    cur_words = []

    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0

    output.append(' '.join(cur_words))
    return output
Example #7
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: str
    :param fail_on_unknown: If true, then raise a value error if
        any of the given words do not occur at all in the index.
    """
    words = [self._key(w) for w in words]
    contexts = [set(self._word_to_contexts[w]) for w in words]

    empty = [words[i] for i in range(len(words)) if not contexts[i]]
    common = reduce(set.intersection, contexts)
    if empty and fail_on_unknown:
        raise ValueError("The following word(s) were not found:", " ".join(words))
    elif not common:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    else:
        fd = FreqDist(
            c for w in words for c in self._word_to_contexts[w] if c in common
        )
        return fd
Example #8
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def __init__(self, tokens, key=lambda x: x):
    """
    Construct a new concordance index.

    :param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    :param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use ``key=lambda s:s.lower()``, then the index will be
        case-insensitive.
    """
    self._tokens = tokens
    """The document (list of tokens) that this concordance index
       was created from."""

    self._key = key
    """Function mapping each token to an index key (or None)."""

    self._offsets = defaultdict(list)
    """Dictionary mapping words (or keys) to lists of offset indices."""

    # Initialize the index (self._offsets)
    for index, word in enumerate(tokens):
        word = self._key(word)
        self._offsets[word].append(index)
Example #9
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def collocation_list(self, num=20, window_size=2):
    """
    Return collocations derived from the text, ignoring stopwords.

    :param num: The maximum number of collocations to return.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not (
        "_collocations" in self.__dict__
        and self._num == num
        and self._window_size == window_size
    ):
        self._num = num
        self._window_size = window_size

        # print("Building collocations list")
        from nltk.corpus import stopwords

        ignored_words = stopwords.words("english")
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    return [w1 + " " + w2 for w1, w2 in self._collocations]
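In recent NLTK releases this method is reachable through nltk.text.Text, so a short usage sketch over Brown 'news' text (the stopwords corpus must be downloaded) could be:

from nltk.corpus import brown
from nltk.text import Text

text = Text(brown.words(categories='news'))
# Top-ranked bigram collocations, e.g. pairs like 'United States'.
print(text.collocation_list(num=10))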
Example #10
Source File: do_benchmark.py From PyRATA with Apache License 2.0 | 5 votes |
def measure_pattern_time_v2(iteration_number, size, pattern):
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
words = brown.words()[:%s]
text = ' '.join(words)

from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,     # Split punctuation marks from words?
    tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,      # Parse chunks? (NP, VP, PNP, ...)
    relations = False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,     # Parse lemmata? (ate => eat)
    encoding = 'utf-8',  # Input string encoding.
    tagset = None)       # Penn Treebank II (default) or UNIVERSAL.

from pattern.search import search

def measure_pattern_search():
    global pattern_search_result    # Make measure_me able to modify the value
    pattern_search_result = search("%s", text_tree)
    #print ("clip.pattern len(result)="+str(len(pattern_search_result)))

from timeit import Timer
pattern_search_time = Timer(measure_pattern_search)
#print ('pattern_search_time')

def pattern_search_timeit():
    runtimes = [pattern_search_time.timeit(number=1) for i in range (0, %s)]
    average = sum(runtimes)/len(runtimes)
    # return ''.join(['timit: #runs=', str(%s), ' ; average=', str(average),' ; min=', str(min(runtimes))])
    return [runtimes, average, min(runtimes), len(pattern_search_result)]

channel.send(pattern_search_timeit())
    """ % (size, pattern, iteration_number, iteration_number))
    channel.send([])
    return channel.receive()
Example #11
Source File: do_benchmark.py From PyRATA with Apache License 2.0 | 5 votes |
def write_pattern_v2(iteration_number, size, pattern):
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
size = %s
words = brown.words()[:size]
text = ' '.join(words)

from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,     # Split punctuation marks from words?
    tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,      # Parse chunks? (NP, VP, PNP, ...)
    relations = False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,     # Parse lemmata? (ate => eat)
    encoding = 'utf-8',  # Input string encoding.
    tagset = None)       # Penn Treebank II (default) or UNIVERSAL.

def backslash(string):
    for ch in [' ','?', '+', '*', '.', '[', ']', '~' , '{', '}', '|', '"', "'", ',', ':', '<', '>']:
        if ch in string:
            string = string.replace(ch,'_')
    return string

from pattern.search import search
pattern = "%s"
pattern_search_result = search(pattern, text_tree)
measure_pattern_search()
filename = '/tmp/benchmark_'+analyzer_name+'_'+str(size)+"_"+str(len(pattern_search_result))+'_'+backslash(pattern)
thefile = open(filename, 'w')
for item in pattern_search_result:
    print>>thefile, item
channel.send([filename, size, len(pattern_search_result)])
    """ % (size, pattern, iteration_number, iteration_number))
    channel.send([])
    return channel.receive()
Example #12
Source File: print_english_words.py From adversarial-squad with MIT License | 5 votes |
def main():
    freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
    vocab = [x[0] for x in freq_dist.most_common()[:OPTS.size]]
    for w in vocab:
        print w
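PUNCTUATION and OPTS.size are defined elsewhere in print_english_words.py; a rough Python 3 standalone equivalent, with assumed stand-ins for both, would be:

import string
from nltk import FreqDist
from nltk.corpus import brown

PUNCTUATION = set(string.punctuation)      # assumed stand-in for the project's constant
size = 10000                               # assumed stand-in for OPTS.size

# Count lowercased Brown tokens, skipping bare punctuation tokens.
freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
vocab = [x[0] for x in freq_dist.most_common()[:size]]
for w in vocab[:20]:
    print(w)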
Example #13
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def word_similarity_dict(self, word):
    """
    Return a dictionary mapping from words to 'similarity scores,'
    indicating how often these two words occur in the same
    context.
    """
    word = self._key(word)
    word_contexts = set(self._word_to_contexts[word])

    scores = {}
    for w, w_contexts in self._word_to_contexts.items():
        scores[w] = f_measure(word_contexts, set(w_contexts))

    return scores
Example #14
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def find_concordance(self, word, width=80):
    """
    Find all concordance lines given the query word.
    """
    half_width = (width - len(word) - 2) // 2
    context = width // 4  # approx number of words of context

    # Find the instances of the word to create the ConcordanceLine
    concordance_list = []
    offsets = self.offsets(word)
    if offsets:
        for i in offsets:
            query_word = self._tokens[i]
            # Find the context of query word.
            left_context = self._tokens[max(0, i - context) : i]
            right_context = self._tokens[i + 1 : i + context]
            # Create the pretty lines with the query_word in the middle.
            left_print = " ".join(left_context)[-half_width:]
            right_print = " ".join(right_context)[:half_width]
            # The WYSIWYG line of the concordance.
            line_print = " ".join([left_print, query_word, right_print])
            # Create the ConcordanceLine
            concordance_line = ConcordanceLine(
                left_context,
                query_word,
                right_context,
                i,
                left_print,
                right_print,
                line_print,
            )
            concordance_list.append(concordance_line)
    return concordance_list
Example #15
Source File: spellcheck.py From normalise with GNU General Public License v3.0 | 5 votes |
def words(text):
    return re.findall('[a-z]+', text.lower())
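A quick check of what this regex tokenizer does to Brown text; the function is repeated here so the sketch runs on its own:

import re
from nltk.corpus import brown

def words(text):
    return re.findall('[a-z]+', text.lower())

sample = ' '.join(brown.words()[:50])
print(words(sample)[:10])                  # lowercased alphabetic runs only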
Example #16
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def common_contexts(self, words, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.

    :param words: The words used to seed the similarity search
    :type words: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.common_contexts()
    """
    if "_word_context_index" not in self.__dict__:
        # print('Building word-context index...')
        self._word_context_index = ContextIndex(
            self.tokens, key=lambda s: s.lower()
        )

    try:
        fd = self._word_context_index.common_contexts(words, True)
        if not fd:
            print("No common contexts were found")
        else:
            ranked_contexts = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

    except ValueError as e:
        print(e)
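Exercising this method through the public API on Brown 'news' text; the word pair is an arbitrary choice, and the method simply reports when no common contexts exist:

from nltk.corpus import brown
from nltk.text import Text

text = Text(brown.words(categories='news'))
# Prints contexts shared by both words, or a message if there are none.
text.common_contexts(['government', 'state'])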
Example #17
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def dispersion_plot(self, words):
    """
    Produce a plot showing the distribution of the words through the text.
    Requires pylab to be installed.

    :param words: The words to be plotted
    :type words: list(str)
    :seealso: nltk.draw.dispersion_plot()
    """
    from nltk.draw import dispersion_plot

    dispersion_plot(self, words)
Example #18
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def __init__(self, source):
    if hasattr(source, "words"):  # bridge to the text corpus reader
        source = [source.words(f) for f in source.fileids()]

    self._texts = source
    Text.__init__(self, LazyConcatenation(source))
    self._idf_cache = {}
Example #19
Source File: text.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories="news"))
    print(text)
    print()
    print("Concordance:")
    text.concordance("news")
    print()
    print("Distributionally similar words:")
    text.similar("news")
    print()
    print("Collocations:")
    text.collocations()
    print()
    # print("Automatically generated text:")
    # text.generate()
    # print()
    print("Dispersion plot:")
    text.dispersion_plot(["news", "report", "said", "announced"])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()["news"])
Example #20
Source File: testingNLP.py From python-urbanPlanning with MIT License | 5 votes |
def splitter(data, num_words):
    words = data.split(' ')
    output = []
    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0
    output.append(' '.join(cur_words))
    return output
Example #21
Source File: testingNLP.py From python-urbanPlanning with MIT License | 5 votes |
def splitter(data, num_words):
    words = data.split(' ')
    output = []
    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0
    output.append(' '.join(cur_words))
    return output
Example #22
Source File: 3_corpus.py From ml_code with Apache License 2.0 | 5 votes |
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)
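Assuming unusual_words() from the example above is in scope, it can be pointed at Brown tokens directly (both the words and brown corpora must be downloaded):

import nltk

news_text = nltk.corpus.brown.words(categories='news')
rare = unusual_words(news_text)            # tokens absent from the standard wordlist
print(len(rare), rare[:10])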
Example #23
Source File: 3_corpus.py From ml_code with Apache License 2.0 | 5 votes |
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)
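Assuming content_fraction() from the example above is in scope, applying it to a Brown category gives the share of tokens that are not English stopwords:

import nltk

fraction = content_fraction(nltk.corpus.brown.words(categories='news'))
print('Non-stopword fraction: %.2f' % fraction)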
Example #24
Source File: text.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: str
    :param fail_on_unknown: If true, then raise a value error if
        any of the given words do not occur at all in the index.
    """
    words = [self._key(w) for w in words]
    contexts = [set(self._word_to_contexts[w]) for w in words]

    empty = [words[i] for i in range(len(words)) if not contexts[i]]
    common = reduce(set.intersection, contexts)
    if empty and fail_on_unknown:
        raise ValueError("The following word(s) were not found:", " ".join(words))
    elif not common:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    else:
        fd = FreqDist(c for w in words
                      for c in self._word_to_contexts[w]
                      if c in common)
        return fd
Example #25
Source File: text.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: str
    :param fail_on_unknown: If true, then raise a value error if
        any of the given words do not occur at all in the index.
    """
    words = [self._key(w) for w in words]
    contexts = [set(self._word_to_contexts[w]) for w in words]

    empty = [words[i] for i in range(len(words)) if not contexts[i]]
    common = reduce(set.intersection, contexts)
    if empty and fail_on_unknown:
        raise ValueError("The following word(s) were not found:", " ".join(words))
    elif not common:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    else:
        fd = FreqDist(c for w in words
                      for c in self._word_to_contexts[w]
                      if c in common)
        return fd
Example #26
Source File: text.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, tokens, key=lambda x: x):
    """
    Construct a new concordance index.

    :param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    :param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use ``key=lambda s:s.lower()``, then the index will be
        case-insensitive.
    """
    self._tokens = tokens
    """The document (list of tokens) that this concordance index
       was created from."""

    self._key = key
    """Function mapping each token to an index key (or None)."""

    self._offsets = defaultdict(list)
    """Dictionary mapping words (or keys) to lists of offset indices."""

    # Initialize the index (self._offsets)
    for index, word in enumerate(tokens):
        word = self._key(word)
        self._offsets[word].append(index)
Example #27
Source File: text.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def print_concordance(self, word, width=75, lines=25):
    """
    Print a concordance for ``word`` with the specified context window.

    :param word: The target word
    :type word: str
    :param width: The width of each line, in characters (default=80)
    :type width: int
    :param lines: The number of lines to display (default=25)
    :type lines: int
    """
    half_width = (width - len(word) - 2) // 2
    context = width // 4  # approx number of words of context

    offsets = self.offsets(word)
    if offsets:
        lines = min(lines, len(offsets))
        print("Displaying %s of %s matches:" % (lines, len(offsets)))
        for i in offsets:
            if lines <= 0:
                break
            left = (' ' * half_width + ' '.join(self._tokens[i-context:i]))
            right = ' '.join(self._tokens[i+1:i+context])
            left = left[-half_width:]
            right = right[:half_width]
            print(left, self._tokens[i], right)
            lines -= 1
    else:
        print("No matches")
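A sketch of calling this method via nltk.text.ConcordanceIndex on Brown tokens, assuming a recent NLTK release where the class and method are public:

from nltk.corpus import brown
from nltk.text import ConcordanceIndex

ci = ConcordanceIndex(brown.words(categories='news'), key=lambda s: s.lower())
ci.print_concordance('economy', width=75, lines=5)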
Example #28
Source File: text.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def common_contexts(self, words, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.

    :param words: The words used to seed the similarity search
    :type words: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.common_contexts()
    """
    if '_word_context_index' not in self.__dict__:
        #print('Building word-context index...')
        self._word_context_index = ContextIndex(self.tokens,
                                                key=lambda s: s.lower())

    try:
        fd = self._word_context_index.common_contexts(words, True)
        if not fd:
            print("No common contexts were found")
        else:
            ranked_contexts = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

    except ValueError as e:
        print(e)
Example #29
Source File: text.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def dispersion_plot(self, words):
    """
    Produce a plot showing the distribution of the words through the text.
    Requires pylab to be installed.

    :param words: The words to be plotted
    :type words: list(str)
    :seealso: nltk.draw.dispersion_plot()
    """
    from nltk.draw import dispersion_plot

    dispersion_plot(self, words)
Example #30
Source File: text.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, source):
    if hasattr(source, 'words'):  # bridge to the text corpus reader
        source = [source.words(f) for f in source.fileids()]

    self._texts = source
    Text.__init__(self, LazyConcatenation(source))
    self._idf_cache = {}