Python sklearn.feature_extraction.text.ENGLISH_STOP_WORDS Examples
The following are 6 code examples of sklearn.feature_extraction.text.ENGLISH_STOP_WORDS (a constant collection of stop words that is used directly, not called).
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.feature_extraction.text, or try the search function.
Example #1
Source File: test_TermDocMat.py From scattertext with Apache License 2.0 | 4 votes |
def test_get_stoplisted_unigram_corpus(self):
    """Check the terms kept by ``get_stoplisted_unigram_corpus``.

    The index of the stoplisted corpus's term-frequency frame must equal
    the set of terms from the full matrix that contain no space, contain
    no apostrophe, and are not members of ``ENGLISH_STOP_WORDS``.
    """
    full_tdm = make_a_test_term_doc_matrix()
    unigram_tdm = full_tdm.get_stoplisted_unigram_corpus()
    full_freqs = full_tdm.get_term_freq_df()
    unigram_freqs = unigram_tdm.get_term_freq_df()

    # Rebuild the expected vocabulary by filtering the full term index.
    expected_terms = {
        candidate
        for candidate in full_freqs.index
        if ' ' not in candidate
        and "'" not in candidate
        and candidate not in ENGLISH_STOP_WORDS
    }
    actual_terms = set(unigram_freqs.index)
    self.assertEqual(expected_terms, actual_terms)
Example #2
Source File: test_TermDocMat.py From scattertext with Apache License 2.0 | 4 votes |
def test_allow_single_quotes_in_unigrams(self):
    """Check unigram stoplisting after ``allow_single_quotes_in_unigrams``.

    First asserts that ``allow_single_quotes_in_unigrams()`` returns an
    object of the same type as the matrix it was called on. Then asserts
    that the stoplisted unigram corpus's term index equals the set of
    terms from the full matrix that contain no space and are not in
    ``ENGLISH_STOP_WORDS`` (terms containing an apostrophe are not
    filtered out here).
    """
    full_tdm = make_a_test_term_doc_matrix()

    returned = full_tdm.allow_single_quotes_in_unigrams()
    self.assertEqual(type(returned), type(full_tdm))

    unigram_tdm = full_tdm.get_stoplisted_unigram_corpus()
    full_freqs = full_tdm.get_term_freq_df()
    unigram_freqs = unigram_tdm.get_term_freq_df()

    expected_terms = {
        candidate
        for candidate in full_freqs.index
        if ' ' not in candidate and candidate not in ENGLISH_STOP_WORDS
    }
    actual_terms = set(unigram_freqs.index)
    self.assertEqual(expected_terms, actual_terms)
Example #3
Source File: test_TermDocMat.py From scattertext with Apache License 2.0 | 4 votes |
def _assert_stoplisted_minus_joe(self, tdm, uni_tdm):
    """Assert that ``uni_tdm`` holds the stoplisted unigrams of ``tdm`` minus 'joe'.

    The expected vocabulary is every term in ``tdm``'s term-frequency
    index that contains no space, is not 'joe' when lower-cased, contains
    no apostrophe, and is not in ``ENGLISH_STOP_WORDS``. It is compared
    for equality with the set of terms in ``uni_tdm``'s term-frequency
    index.
    """
    full_freqs = tdm.get_term_freq_df()
    unigram_freqs = uni_tdm.get_term_freq_df()

    expected_terms = {
        candidate
        for candidate in full_freqs.index
        if ' ' not in candidate
        and 'joe' != candidate.lower()
        and "'" not in candidate
        and candidate not in ENGLISH_STOP_WORDS
    }
    actual_terms = set(unigram_freqs.index)
    self.assertEqual(expected_terms, actual_terms)
Example #4
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License | 4 votes |
def test_countvectorizer_stop_words():
    """Exercise ``CountVectorizer.get_stop_words`` for several ``stop_words`` settings.

    Asserts that:
    * ``stop_words='english'`` yields ``ENGLISH_STOP_WORDS``;
    * two unrecognised string values make ``get_stop_words`` raise
      ``ValueError``;
    * an explicit list yields a set with the same members.
    """
    vectorizer = CountVectorizer()

    vectorizer.set_params(stop_words='english')
    assert_equal(vectorizer.get_stop_words(), ENGLISH_STOP_WORDS)

    # Both bad names are checked the same way, in the original order.
    for bad_name in ('_bad_str_stop_', '_bad_unicode_stop_'):
        vectorizer.set_params(stop_words=bad_name)
        assert_raises(ValueError, vectorizer.get_stop_words)

    custom_words = ['some', 'other', 'words']
    vectorizer.set_params(stop_words=custom_words)
    assert_equal(vectorizer.get_stop_words(), set(custom_words))
Example #5
Source File: field_types.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 4 votes |
def _build_stop_words(self) -> Set[str]:
    """Return the stop words to use for this field.

    If ``self.field.get_vectorizer_stop_words()`` returns a truthy value,
    the result is a new set containing ``ENGLISH_STOP_WORDS`` plus those
    extra words. Otherwise the ``ENGLISH_STOP_WORDS`` object itself is
    returned, without copying.
    """
    extra_words = self.field.get_vectorizer_stop_words()
    if not extra_words:
        # No field-specific additions: hand back the shared object as-is.
        return ENGLISH_STOP_WORDS

    # Copy before extending so the shared collection is never touched.
    combined = set(ENGLISH_STOP_WORDS)
    combined.update(extra_words)
    return combined
Example #6
Source File: test_text.py From twitter-stock-recommendation with MIT License | 4 votes |
def test_countvectorizer_stop_words():
    """Verify how ``CountVectorizer`` resolves its ``stop_words`` parameter.

    The checks, in order: the string 'english' gives
    ``ENGLISH_STOP_WORDS``; each of two unknown strings causes
    ``get_stop_words`` to raise ``ValueError``; a user-supplied list is
    returned as an equal set.
    """
    count_vec = CountVectorizer()

    # Built-in English list selected by name.
    count_vec.set_params(stop_words='english')
    resolved = count_vec.get_stop_words()
    assert_equal(resolved, ENGLISH_STOP_WORDS)

    # Unknown names are rejected when the stop words are resolved.
    count_vec.set_params(stop_words='_bad_str_stop_')
    assert_raises(ValueError, count_vec.get_stop_words)

    count_vec.set_params(stop_words='_bad_unicode_stop_')
    assert_raises(ValueError, count_vec.get_stop_words)

    # An explicit list is accepted and compared as a set.
    user_words = ['some', 'other', 'words']
    count_vec.set_params(stop_words=user_words)
    resolved = count_vec.get_stop_words()
    assert_equal(resolved, set(user_words))