Python nltk.corpus.reuters.fileids() Examples
The following are 3 code examples of nltk.corpus.reuters.fileids().
Each example lists its original project and source file. You may also want to check out all available functions and classes of the module nltk.corpus.reuters.
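Before the project examples, here is a minimal standalone sketch of what the function returns. This snippet is not taken from any of the projects below; it only assumes the Reuters corpus has been fetched with nltk.download('reuters').

import nltk
from nltk.corpus import reuters

nltk.download('reuters')             # fetch the corpus if it is not already cached locally
fileids = reuters.fileids()          # e.g. ['test/14826', 'test/14828', ...]
print(len(fileids))                  # total number of documents in the corpus
print(reuters.fileids('trade')[:3])  # fileids() also accepts a category filter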
Example #1
Source File: setuptextcorpus.py, from rnn-speech (MIT License)
from nltk.corpus import reuters  # import assumed by this snippet

def get_corpus_text():
    ''' return raw text of reuters corpus '''
    return [" ".join(reuters.words(fid)) for fid in reuters.fileids()]
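A hypothetical way to call the helper above (the call itself is not part of the rnn-speech source; it assumes get_corpus_text is in scope and the corpus is downloaded):

texts = get_corpus_text()
print(len(texts))      # one string per Reuters document
print(texts[0][:100])  # first 100 characters of the first document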
Example #2
Source File: preprocessing.py, from Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction (MIT License)
import json
import operator
import os

import nltk
from nltk.corpus import reuters
# unify_word() is a project-specific normalization helper defined elsewhere in the repository.

def gen_financial_top_words(maxN=40000):  # generate corpus based on Reuters news
    if not os.path.isfile('./input/topWords.json'):
        wordCnt = {}
        for field in reuters.fileids():
            for word in reuters.words(field):
                word = unify_word(word)
                if word in nltk.corpus.stopwords.words('english'):
                    continue
                wordCnt[word] = wordCnt.get(word, 0) + 1

        sorted_wordCnt = sorted(wordCnt.items(), key=operator.itemgetter(1), reverse=True)
        wordCnt = {}  # reset wordCnt
        for i in sorted_wordCnt[:maxN]:
            wordCnt[i[0]] = i[1]  # convert list to dict

        with open('./input/topWords.json', 'w') as fout:
            json.dump(wordCnt, fout, indent=4)
    else:
        return
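A hypothetical invocation of the function above (not part of the original project; it assumes an ./input directory exists for the JSON output and that json is imported):

gen_financial_top_words(maxN=40000)  # writes ./input/topWords.json on the first run
with open('./input/topWords.json') as fin:
    top_words = json.load(fin)
print(len(top_words))                # up to 40,000 most frequent non-stopword tokens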
Example #3
Source File: word_embedding.py, from Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction (MIT License)
import operator

from nltk.corpus import reuters
# unify_word() is a project-specific normalization helper defined elsewhere in the repository.

def get_reuters_data(n_vocab):
    # return variables
    sentences = []
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    current_idx = 2
    word_idx_count = {0: float('inf'), 1: float('inf')}
    tag = 0
    for field in reuters.fileids():
        sentence = reuters.words(field)
        tokens = [unify_word(t) for t in sentence]
        for t in tokens:
            if t not in word2idx:
                word2idx[t] = current_idx
                idx2word.append(t)
                current_idx += 1
            idx = word2idx[t]
            word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
        sentence_by_idx = [word2idx[t] for t in tokens]
        sentences.append(sentence_by_idx)
        tag += 1
        print(tag)

    # restrict vocab size
    sorted_word_idx_count = sorted(word_idx_count.items(), key=operator.itemgetter(1), reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]
        print(word, count)
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1
    # let 'unknown' be the last token
    word2idx_small['UNKNOWN'] = new_idx
    unknown = new_idx

    # map old idx to new idx
    sentences_small = []
    for sentence in sentences:
        if len(sentence) > 1:
            new_sentence = [idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown
                            for idx in sentence]
            sentences_small.append(new_sentence)
    return sentences_small, word2idx_small
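A hypothetical call of the function above with a 2,000-word vocabulary (not part of the original project):

sentences, word2idx = get_reuters_data(n_vocab=2000)
print(len(sentences))  # number of indexed documents longer than one token
print(len(word2idx))   # 2,000 vocabulary entries plus 'UNKNOWN'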