Python sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS Examples
The following are 9 code examples of sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS, a frozenset of common English words used by scikit-learn's text vectorizers. You can go to the original project or source file by following the links above each example, or check out all available functions/classes of the module sklearn.feature_extraction.stop_words.
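For quick reference, here is a minimal sketch of using the frozenset directly. Note that in recent scikit-learn releases the stop_words module was deprecated and then removed; the canonical import path is sklearn.feature_extraction.text, which also works in older versions.

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print(type(ENGLISH_STOP_WORDS))        # <class 'frozenset'>
print("the" in ENGLISH_STOP_WORDS)     # True
print("python" in ENGLISH_STOP_WORDS)  # False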
Example #1
Source File: nmf_context_extractor.py From yelp with GNU Lesser General Public License v2.1 | 7 votes |
def build_document_term_matrix(self):
    self.tfidf_vectorizer = TfidfVectorizer(
        stop_words=ENGLISH_STOP_WORDS, lowercase=True,
        strip_accents="unicode", use_idf=True, norm="l2",
        min_df=Constants.MIN_DICTIONARY_WORD_COUNT,
        max_df=Constants.MAX_DICTIONARY_WORD_COUNT,
        ngram_range=(1, 1))
    self.document_term_matrix = \
        self.tfidf_vectorizer.fit_transform(self.target_bows)

    # Invert the {term: column_index} vocabulary into an index-ordered term list.
    vocabulary = self.tfidf_vectorizer.vocabulary_
    num_terms = len(vocabulary)
    self.terms = [""] * num_terms
    for term in vocabulary.keys():
        self.terms[vocabulary[term]] = term

    print("Created document-term matrix of size %d x %d" % (
        self.document_term_matrix.shape[0],
        self.document_term_matrix.shape[1]))
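As a self-contained illustration of the same pattern, the sketch below builds a small document-term matrix. The corpus and the omission of the Constants thresholds are illustrative, not from the yelp project.

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer

corpus = ["the pizza was great",
          "the service was slow and the pizza was cold"]
# Any collection works for stop_words; it is converted to a frozenset internally.
vectorizer = TfidfVectorizer(stop_words=list(ENGLISH_STOP_WORDS),
                             lowercase=True, strip_accents="unicode", norm="l2")
matrix = vectorizer.fit_transform(corpus)
print("Created document-term matrix of size %d x %d" % matrix.shape)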
Example #2
Source File: lexicon_helper.py From linguistic-style-transfer with Apache License 2.0 | 5 votes |
def get_stopwords():
    nltk_stopwords = set(stopwords.words('english'))
    sklearn_stopwords = stop_words.ENGLISH_STOP_WORDS
    all_stopwords = set()
    # spacy_stopwords is defined elsewhere in the original module.
    all_stopwords |= spacy_stopwords
    all_stopwords |= nltk_stopwords
    all_stopwords |= sklearn_stopwords
    return all_stopwords
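A self-contained version of the same union is sketched below; it assumes NLTK's stopword corpus has been downloaded (nltk.download('stopwords')) and that spaCy is installed.

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords

# Union the three lexicons; each library draws the stop word line differently.
all_stopwords = (set(stopwords.words('english'))
                 | set(spacy_stopwords)
                 | set(ENGLISH_STOP_WORDS))
print(len(all_stopwords))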
Example #3
Source File: STFIWF.py From 2016CCF-sougou with Apache License 2.0 | 5 votes |
def _check_stop_list(stop):
    if stop == "english":
        return ENGLISH_STOP_WORDS
    elif isinstance(stop, six.string_types):
        raise ValueError("not a built-in stop list: %s" % stop)
    elif stop is None:
        return None
    else:  # assume it's a collection
        return frozenset(stop)
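Illustrative calls showing the four branches (these are not part of the project source):

_check_stop_list("english")       # returns the ENGLISH_STOP_WORDS frozenset
_check_stop_list(None)            # returns None, disabling stop word filtering
_check_stop_list(["foo", "bar"])  # returns frozenset({'foo', 'bar'})
_check_stop_list("french")        # raises ValueError: not a built-in stop list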
Example #4
Source File: sklearn_intent_classifer.py From ai-chatbot-framework with MIT License | 5 votes |
def __init__(self):
    self.model = None
    self.spacynlp = spacy.load('en')
    # list() guards against STOP_WORDS being a set (as in modern spaCy),
    # which would make the + concatenation raise a TypeError.
    self.stopwords = set(list(STOP_WORDS) +
                         ["n't", "'s", "'m", "ca"] +
                         list(ENGLISH_STOP_WORDS))
    self.punctuations = " ".join(string.punctuation).split(" ") + \
        ["-----", "---", "...", "'ve"]
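A hypothetical use of these attributes is filtering both stop words and punctuation out of a token list; the stand-in sets below are made up for illustration.

stopword_set = {"i", "a", "ca", "n't"}  # stand-in for self.stopwords
punctuation = {"!", "?"}                # stand-in for self.punctuations
tokens = ["i", "ca", "n't", "book", "a", "flight", "!"]
content = [t for t in tokens
           if t not in stopword_set and t not in punctuation]
print(content)  # ['book', 'flight']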
Example #5
Source File: STFIWF.py From 2016_CCFsougou2 with MIT License | 5 votes |
def _check_stop_list(stop):
    if stop == "english":
        return ENGLISH_STOP_WORDS
    elif isinstance(stop, six.string_types):
        raise ValueError("not a built-in stop list: %s" % stop)
    elif stop is None:
        return None
    else:  # assume it's a collection
        return frozenset(stop)
Example #6
Source File: kaggle18.py From modin with Apache License 2.0 | 5 votes |
def wordCount(text):
    try:
        text = text.lower()
        # Replace punctuation, digits, and control characters with spaces.
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        txt = regex.sub(" ", text)
        words = [
            w
            for w in txt.split(" ")
            if w not in stop_words.ENGLISH_STOP_WORDS and len(w) > 3
        ]
        return len(words)
    except Exception:
        return 0
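An illustrative call, not part of the modin source:

# "the" and "over" are stop words; "fox" is only 3 characters; "12" is stripped.
print(wordCount("The quick, brown fox jumped over 12 lazy dogs!"))  # 5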
Example #7
Source File: open_ended_coders.py From mpeds with MIT License | 5 votes |
def _loadSpecialWords(self):
    ''' Load stop words, number prefixes, news agencies, and protest subject words. '''
    self.S_PREFIX = ['around', 'up to', 'as many as', 'some', 'many',
                     'nearly', 'more than', 'about']

    self.P_SUBJ = {
        'protest': ['protesters', 'protestors', 'demonstrators', 'activists',
                    'strikers', 'marchers', 'signatures',
                    'counter-demonstrators', 'counter-demonstraters',
                    'counter-protesters', 'counter-protestors',
                    'counterprotesters', 'counterprotestors']
    }

    self.AGW = ['Agence France-Presse, English Service',
                'Associated Press Worldstream, English Service']

    self.SWS = list(stop_words.ENGLISH_STOP_WORDS)
Example #8
Source File: STFIWF.py From 2016CCF_BDCI_Sougou with MIT License | 5 votes |
def _check_stop_list(stop):
    if stop == "english":
        return ENGLISH_STOP_WORDS
    elif isinstance(stop, six.string_types):
        raise ValueError("not a built-in stop list: %s" % stop)
    elif stop is None:
        return None
    else:  # assume it's a collection
        return frozenset(stop)
Example #9
Source File: normalize_text.py From altair with Apache License 2.0 | 4 votes |
def normalize_text(raw_text, remove_stop_words=True, only_letters=True,
                   return_list=False, remove_one_char_words=True, **kwargs):
    '''
    Algorithm to convert raw text to a clean text string

    Method modified from code available at:
    https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

    Args:
        raw_text: Original text to clean and normalize
        remove_stop_words: Boolean value to trigger removal of stop words
        only_letters: Boolean value to trigger removal of characters that are not letters
        return_list: Boolean value to trigger return value as a list of words
        remove_one_char_words: Boolean value to trigger removal of words that are only a single character

    Returns:
        clean_text: Either a string or a list of words that has been filtered based on function parameters.
    '''
    # Remove web links
    clean_text = link_re.sub('', raw_text)

    # Remove HTML
    # Suppress UserWarnings from BeautifulSoup due to text with tech info (ex: code, directory structure)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        clean_text = BeautifulSoup(clean_text, "lxml").get_text()

    # Only keep letters, or keep letters and numbers
    if only_letters:
        clean_text = letter_re.sub(" ", clean_text)
    else:
        clean_text = letter_number_re.sub(" ", clean_text)

    # Convert to lower case, split into individual words
    clean_text = clean_text.lower().split()

    # If numbers are allowed in words, remove candidate words that only contain numbers
    if not only_letters:
        clean_text = [w for w in clean_text if not all(i.isdigit() for i in w)]

    # Remove stop words
    if remove_stop_words:
        clean_text = [w for w in clean_text if w not in python_stop_words]
        clean_text = [w for w in clean_text if w not in ENGLISH_STOP_WORDS]

    # Remove words that are only a single character in length
    if remove_one_char_words:
        clean_text = [w for w in clean_text if len(w) > 1]

    # Return as string or list based on parameters
    if return_list:
        return clean_text
    else:
        return " ".join(clean_text)
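An illustrative call, assuming the module-level names the function relies on (link_re, letter_re, letter_number_re, python_stop_words) are defined as in the altair project:

text = "Visit <b>https://example.com</b> for THREE great Python tutorials!"
# The exact output depends on the project's regex and stop word definitions.
print(normalize_text(text))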