Python stop_words.get_stop_words() Examples
The following are 17 code examples of stop_words.get_stop_words(). You can go to the original project or source file by following the links above each example, or check out the other available functions and classes of the stop_words module.
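Before the examples, here is a minimal sketch of the basic API, assuming the python-stop-words package is installed:

from stop_words import get_stop_words, safe_get_stop_words

# Both ISO codes and full language names are accepted
english_a = get_stop_words('en')
english_b = get_stop_words('english')
assert english_a == english_b

# safe_get_stop_words returns an empty list instead of raising
# StopWordError for an unsupported language
assert safe_get_stop_words('sindarin') == []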
Example #1
Source File: lex_sem_ft.py From DL-text with MIT License

def LDA_train(doc):
    # Relies on a module-level tokenizer and on gensim's corpora/models,
    # both imported elsewhere in lex_sem_ft.py
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if i not in en_stop]
            red.append(stopped_tokens)
        except:
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=1)
    return lda

# Returns average of probability of word present in LDA model for input document (returns float):
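A minimal sketch of how LDA_train might be exercised, assuming gensim and NLTK are installed; the tokenizer below is a stand-in for whatever lex_sem_ft.py defines at module level:

from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
from stop_words import get_stop_words

tokenizer = RegexpTokenizer(r'\w+')  # stand-in for the original module's tokenizer

docs = ["The quick brown fox jumps over the lazy dog",
        "A trained model extracts topics from raw documents"]
lda = LDA_train(docs)
print(lda.print_topics(num_topics=2, num_words=3))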
Example #2
Source File: lex_sem_ft.py From DeepLearn with MIT License

The code is identical to Example #1: both projects ship the same lex_sem_ft.py.
Example #3
Source File: scrape2.py From Web-Scraping with MIT License

def clean_up_words(words):
    new_words = []
    pkg_stop_words = get_stop_words('en')
    # site-specific junk tokens on top of the packaged stop words
    my_stop_words = [
        'the', 'is', 'and', 'thisfacebooktwitteremailredditprint', '',
        'reply', 'likelike', 'likeliked', 'comments', 'commenting', '/', '=',
    ]
    for word in words:
        word = word.lower()
        cleaned_word = clean_word(word)  # helper defined elsewhere in scrape2.py
        if cleaned_word in my_stop_words or cleaned_word in pkg_stop_words:
            pass
        else:
            new_words.append(cleaned_word)
    return new_words
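clean_word is defined elsewhere in scrape2.py; a rough stand-in that strips surrounding punctuation is enough to try the function above:

import string
from stop_words import get_stop_words

def clean_word(word):
    # hypothetical stand-in for the project's own helper
    return word.strip(string.punctuation)

print(clean_up_words(["The", "Quick", "fox!", "and"]))
# ['quick', 'fox']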
Example #4
Source File: eval_utils.py From dataset_agnostic_segmentation with MIT License

def phoc_spottig(it, folder, ignore_stop_words=False, iou_thresh=0.5, dont_load=False,
                 use_gt_phoc=False, filter_small=False, logger=None, max_word_num=None):
    """
    Evaluate mAP for PHOC-based word spotting using an evaluation folder
    (containing *.json) produced by the main.py script.

    Assumes the folder contains json files with
    {'word_%d': {'gt': gt_box, 'text': word annotation,
                 'cover': IoU of predicted word with GT box,
                 'gt_phoc': PHOC for text, 'pre_phoc': predicted PHOC}}

    Performs two tasks:
    (1) Prepare query words
    (2) Call the query function on the query words
    """
    # Create all query words - based on test set ground truth
    if logger is None:
        logger = print
    qwords = []
    for page in it:
        # Some of the datasets (e.g. IAMDB) have words with bad annotations;
        # those are ignored by the eval protocol
        words = page.get_good_words_and_boxes_idx()[1]
        qwords.extend(words)
    qwords = set(qwords)
    logger('Query Words %d' % len(qwords))
    if ignore_stop_words:
        # If there are stop words to be removed...
        qwords = qwords - set(get_stop_words('en'))
        logger('Without stop words %d' % len(qwords))
    if max_word_num is not None:
        # Subsample all queries to partially evaluate
        if len(qwords) > max_word_num:
            idx = np.random.choice(range(len(qwords)), max_word_num, replace=False)
            qwords = set(np.array(list(qwords))[idx])
            logger('Sampled %d words' % len(qwords))
    # Timer, np and query_page_folder_phoc are imported/defined elsewhere in eval_utils.py
    qtimer = Timer()
    qtimer.tic()
    mAP, recall, accuracy = query_page_folder_phoc(
        qwords, folder, threshold=iou_thresh, dont_load=dont_load,
        use_gt_phoc=use_gt_phoc, filter_small=filter_small)
    logger('Finished after %d secs mAP %4.2f Recall %4.2f Accuracy %4.2f'
           % (qtimer.toc(), mAP * 100, recall * 100, accuracy * 100))
    return
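The stop-word handling in this function is plain set subtraction; a minimal illustration:

from stop_words import get_stop_words

qwords = {'the', 'castle', 'and', 'knight'}
print(qwords - set(get_stop_words('en')))  # {'castle', 'knight'}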
Example #5
Source File: lda_model_calculator.py From moviegeek with MIT License

def remove_stopwords(tokenized_data):
    en_stop = get_stop_words('en')
    stopped_tokens = [token for token in tokenized_data if token not in en_stop]
    return stopped_tokens
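Usage is straightforward on any pre-tokenized text:

print(remove_stopwords(['this', 'movie', 'is', 'great']))
# ['movie', 'great']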
Example #6
Source File: tests.py From python-stop-words with BSD 3-Clause "New" or "Revised" License

def test_filters(self):
    language = 'en'
    before = get_stop_words(language, False)
    letter = random.choice(random.choice(before))

    def remove_letter(stopwords, language):
        return [word for word in stopwords if letter not in word]

    stop_words.add_filter(remove_letter)
    after = get_stop_words(language, False)
    for stopword in after:
        self.assertFalse(letter in stopword)
    self.assertTrue(stop_words.remove_filter(remove_letter))
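Outside the test suite, a filter can be registered the same way; a minimal sketch, passing False for the cache argument (as the test does) so the filter is applied rather than a cached list being returned:

import stop_words

def drop_short_words(words, language):
    # drop one- and two-letter stop words
    return [w for w in words if len(w) > 2]

stop_words.add_filter(drop_short_words)
filtered = stop_words.get_stop_words('en', False)
assert all(len(w) > 2 for w in filtered)
stop_words.remove_filter(drop_short_words)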
Example #7
Source File: tests.py From python-stop-words with BSD 3-Clause "New" or "Revised" License

def test_get_stop_words_install_issue(self):
    original_stop_words_dir = stop_words.STOP_WORDS_DIR
    stop_words.STOP_WORDS_DIR = 'not-existing-directory'
    self.assertRaises(StopWordError, get_stop_words, 'german')
    stop_words.STOP_WORDS_DIR = original_stop_words_dir
Example #8
Source File: tests.py From python-stop-words with BSD 3-Clause "New" or "Revised" License

def test_get_stop_words_unavailable_language(self):
    self.assertRaises(StopWordError, get_stop_words, 'sindarin')
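The same behaviour outside the test harness: an unsupported language name raises StopWordError, which is importable from the package:

from stop_words import get_stop_words, StopWordError

try:
    get_stop_words('sindarin')
except StopWordError as exc:
    print('unsupported language:', exc)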
Example #9
Source File: tests.py From python-stop-words with BSD 3-Clause "New" or "Revised" License

def test_get_stop_words_cache(self):
    self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
    sw = get_stop_words('fr')
    self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
    original_stop_words_dir = stop_words.STOP_WORDS_DIR
    stop_words.STOP_WORDS_DIR = 'not-existing-directory'
    self.assertEqual(sw, get_stop_words('french'))
    stop_words.STOP_WORDS_DIR = original_stop_words_dir
    try:
        get_stop_words('klingon')
    except:
        pass
    self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)
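As the test shows, the cache is keyed by the full language name, so 'fr' and 'french' share one entry; a quick check:

import stop_words
from stop_words import get_stop_words

get_stop_words('fr')
print('french' in stop_words.STOP_WORDS_CACHE)  # True
print('fr' in stop_words.STOP_WORDS_CACHE)      # False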
Example #10
Source File: tests.py From python-stop-words with BSD 3-Clause "New" or "Revised" License

def test_get_stop_words_language_mapping(self):
    sw = get_stop_words('en')
    self.assertEqual(len(sw), self.number_of_english_stop_words)
    self.assertEqual(sw, get_stop_words('english'))
Example #11
Source File: tests.py From python-stop-words with BSD 3-Clause "New" or "Revised" License

def test_get_stop_words(self):
    sw = get_stop_words('english')
    self.assertEqual(len(sw), self.number_of_english_stop_words)
Example #12
Source File: index.py From acl-anthology with Apache License 2.0

def load_stopwords(language):
    return [t for w in get_stop_words(language) for t in slugify(w).split("-")]
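A small illustration of the slugify-and-split step, assuming the slugify here is python-slugify (which turns apostrophes and other non-alphanumerics into hyphens): contracted stop words are broken into sub-tokens.

from slugify import slugify

print(slugify("aren't"))             # aren-t
print(slugify("aren't").split("-"))  # ['aren', 't']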
Example #13
Source File: nlp.py From GraphDash with Apache License 2.0

def __init__(self, language):
    self._stop_words = set(stop_words.get_stop_words(language))
Example #14
Source File: scrape1.py From Web-Scraping with MIT License

def clean_up_words(words):
    new_words = []
    pkg_stop_words = get_stop_words('en')
    my_stop_words = ['the', 'is', 'and', 'thisfacebooktwitteremailredditprint']
    for word in words:
        word = word.lower()
        cleaned_word = clean_word(word)
        if cleaned_word in my_stop_words or cleaned_word in pkg_stop_words:
            pass
        else:
            new_words.append(cleaned_word)
    return new_words
Example #15
Source File: preprocessing.py From TBBTCorpus with Apache License 2.0

def __init__(self):
    self.episodeInfo = {}
    self.Info = []
    self.allTranscripts = {}
    self.vocabulary = collections.defaultdict(int)
    self.Stopwords = get_stop_words('en')
    self.impactActors = ["Leonard", "Sheldon", "Penny",
                         "Howard", "Raj", "Amy", "Bernadette"]
Example #16
Source File: topic_extractor.py From TBBTCorpus with Apache License 2.0

def __remove_stop_words(self, docs):
    output = []
    for doc in docs:
        en_stop = get_stop_words('en')  # could be hoisted out of the loop
        stopped_tokens = [i for i in doc if i not in en_stop]
        output.append(stopped_tokens)
    return output
Example #17
Source File: instance_preprocessing.py From sciwing with MIT License

def __init__(self):
    self.stop_words = get_stop_words("en")