Python sklearn.feature_extraction.text.HashingVectorizer() Examples
The following are 27 code examples of sklearn.feature_extraction.text.HashingVectorizer(), drawn from open-source projects. Each example lists its original source file, project, and license.
You may also want to check out all available functions and classes of the module sklearn.feature_extraction.text.
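Before the project examples, here is a minimal, self-contained sketch of typical HashingVectorizer usage (the corpus and parameters are illustrative only, not taken from any of the projects below):

from sklearn.feature_extraction.text import HashingVectorizer

# A toy corpus, purely for illustration
docs = ["the cat sat on the mat", "the dog ate my homework"]

# HashingVectorizer is stateless: transform() works without fit(), which is
# why it is popular for streaming and out-of-core text processing.
vectorizer = HashingVectorizer(n_features=2 ** 10, ngram_range=(1, 2))
X = vectorizer.transform(docs)

print(X.shape)  # (2, 1024), a sparse CSR matrix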
Example #1
Source File: vectorizer.py From robotreviewer with GNU General Public License v3.0
def transform(self, X_si, high=None, low=None, limit=None):
    """
    Same as HashingVectorizer transform, except allows for
    interaction list, which is an iterable the same length as X
    filled with True/False. This method adds an empty row to
    docs labelled as False.
    """
    analyzer = self.build_analyzer()
    X = self._get_hasher().transform(
        analyzer(self._deal_with_input(doc)) for doc in X_si)
    X.data.fill(1)

    if self.norm is not None:
        X = normalize(X, norm=self.norm, copy=False)

    if low:
        X = self._limit_features(X, low=low)
    return X
Example #2
Source File: pico_robot.py From robotreviewer with GNU General Public License v3.0
def __init__(self):
    self.vectorizer = HashingVectorizer(ngram_range=(1, 2))
    self.dict_vectorizer = DictVectorizer()

    # These are set dynamically in training
    # but fixed here to match the end feature names
    # in the trained model. If the model is retrained then
    # these may have to change
    self.dict_vectorizer.feature_names_ = [
        'DocumentPositionQuintile0',
        'DocumentPositionQuintile1',
        'DocumentPositionQuintile2',
        'DocumentPositionQuintile3',
        'DocumentPositionQuintile4',
        'DocumentPositionQuintile5',
        'DocumentPositionQuintile6']
    self.dict_vectorizer.vocabulary_ = {k: i for i, k in enumerate(self.dict_vectorizer.feature_names_)}

    self.drugbank = Drugbank()
Example #3
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_pickling_vectorizer():
    instances = [
        HashingVectorizer(),
        HashingVectorizer(norm='l1'),
        HashingVectorizer(binary=True),
        HashingVectorizer(ngram_range=(1, 2)),
        CountVectorizer(),
        CountVectorizer(preprocessor=strip_tags),
        CountVectorizer(analyzer=lazy_analyze),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
        TfidfVectorizer(),
        TfidfVectorizer(analyzer=lazy_analyze),
        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
    ]

    for orig in instances:
        s = pickle.dumps(orig)
        copy = pickle.loads(s)
        assert_equal(type(copy), orig.__class__)
        assert_equal(copy.get_params(), orig.get_params())
        assert_array_equal(
            copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
            orig.fit_transform(JUNK_FOOD_DOCS).toarray())
Example #4
Source File: bow.py From broca with MIT License
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
    """
    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    # Wrap the specified tokenizer
    t = Tokenizer(tokenizer())

    if hash:
        vectr = HashingVectorizer(input='content',
                                  stop_words='english',
                                  lowercase=True,
                                  tokenizer=t)
    else:
        vectr = CountVectorizer(input='content',
                                stop_words='english',
                                lowercase=True,
                                tokenizer=t,
                                min_df=min_df,
                                max_df=max_df)

    args = [
        ('vectorizer', vectr),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('normalizer', Normalizer(copy=False))
    ]

    self.pipeline = Pipeline(args)
    self.trained = False
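Note the asymmetry above: min_df and max_df are passed only to CountVectorizer. HashingVectorizer never builds a vocabulary, so it cannot filter by document frequency, which is why the hash branch omits those arguments. A minimal sketch of a comparable hashing pipeline (the corpus is made up and no custom tokenizer is wired in):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer

# Hashing-based variant: no min_df/max_df, because HashingVectorizer
# never sees corpus-level document frequencies.
pipeline = Pipeline([
    ('vectorizer', HashingVectorizer(stop_words='english', lowercase=True)),
    ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
    ('normalizer', Normalizer(copy=False)),
])
X = pipeline.fit_transform(["some example document", "another example document"])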
Example #5
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ['aaabc', 'abbde']
    vect = HashingVectorizer(analyzer='char', non_negative=True,
                             norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer='char', non_negative=True,
                             binary=True, norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer='char', non_negative=True,
                             binary=True, norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64)
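The non_negative parameter used above was deprecated in scikit-learn 0.19 and has since been removed; in current releases the roughly equivalent behaviour is alternate_sign=False, as the newer variant of this test in Example #6 shows. A minimal sketch of the modern spelling:

import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer

# Character counts without sign flipping (recent scikit-learn versions)
vect = HashingVectorizer(analyzer='char', alternate_sign=False, norm=None)
X = vect.transform(['aaabc', 'abbde'])
assert np.max(X[0:1].data) == 3  # 'a' occurs three times in the first document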
Example #6
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ['aaabc', 'abbde']
    vect = HashingVectorizer(alternate_sign=False, analyzer='char', norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer='char', alternate_sign=False,
                             binary=True, norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer='char', alternate_sign=False,
                             binary=True, norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64)
Example #7
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = (
        "Машинное обучение — обширный подраздел искусственного "
        "интеллекта, изучающий методы построения алгоритмов, "
        "способных обучаться."
    )

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert_equal(X_counted.shape, (1, 12))

    vect = HashingVectorizer(norm=None, alternate_sign=False)
    X_hashed = vect.transform([document])
    assert_equal(X_hashed.shape, (1, 2 ** 20))

    # No collisions on such a small dataset
    assert_equal(X_counted.nnz, X_hashed.nnz)

    # When norm is None and not alternate_sign, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))
Example #8
Source File: column_encoders.py From datawig with Apache License 2.0
def __init__(self,
             input_columns: Any,
             output_column: str = None,
             max_tokens: int = 2 ** 18,
             tokens: str = 'chars',
             ngram_range: tuple = None,
             prefixed_concatenation: bool = True) -> None:

    if ngram_range is None:
        ngram_range = (1, 3) if tokens == 'words' else (1, 5)

    ColumnEncoder.__init__(self, input_columns, output_column, int(max_tokens))

    if tokens == 'words':
        self.vectorizer = HashingVectorizer(n_features=self.output_dim,
                                            ngram_range=ngram_range)
    elif tokens == 'chars':
        self.vectorizer = HashingVectorizer(n_features=self.output_dim,
                                            ngram_range=ngram_range,
                                            analyzer="char")
    else:
        logger.debug(
            "BowEncoder attribute tokens has to be 'words' or 'chars', defaulting to 'chars'")
        self.vectorizer = HashingVectorizer(n_features=self.output_dim,
                                            ngram_range=ngram_range,
                                            analyzer="char")

    self.prefixed_concatenation = prefixed_concatenation
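The tokens='words' / tokens='chars' switch above simply selects HashingVectorizer's analyzer. A small illustrative sketch (corpus and feature sizes are made up) of how the two analyzers treat near-duplicate strings differently:

from sklearn.feature_extraction.text import HashingVectorizer

word_vect = HashingVectorizer(n_features=2 ** 16, ngram_range=(1, 3))
char_vect = HashingVectorizer(n_features=2 ** 16, ngram_range=(1, 5), analyzer='char')

docs = ["New York", "Newyork"]  # spelling variation, illustrative only
Xw = word_vect.transform(docs)
Xc = char_vect.transform(docs)

# Number of hashed features the two documents share under each analyzer:
# word n-grams share little or nothing, character n-grams overlap heavily.
print(Xw[0].multiply(Xw[1]).nnz)
print(Xc[0].multiply(Xc[1]).nnz)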
Example #9
Source File: test_search_2.py From spark-sklearn with Apache License 2.0
def test_cv_pipeline(self):
    pipeline = SKL_Pipeline([
        ('vect', SKL_HashingVectorizer(n_features=20)),
        ('tfidf', SKL_TfidfTransformer(use_idf=False)),
        ('lasso', SKL_Lasso())
    ])
    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01)
    }
    grid_search = GridSearchCV(self.sc, pipeline, parameters)
    data = [('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0)]
    df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()
    skl_gs = grid_search.fit(df.review.values, df.rating.values)
    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
Example #10
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_vectorizer_stop_words_inconsistent():
    lstr = "['and', 'll', 've']"
    message = ('Your stop_words may be inconsistent with your '
               'preprocessing. Tokenizing the stop words generated '
               'tokens %s not in stop_words.' % lstr)
    for vec in [CountVectorizer(),
                TfidfVectorizer(), HashingVectorizer()]:
        vec.set_params(stop_words=["you've", "you", "you'll", 'AND'])
        assert_warns_message(UserWarning, message, vec.fit_transform,
                             ['hello world'])
        # reset stop word validation
        del vec._stop_words_id
        assert _check_stop_words_consistency(vec) is False

    # Only one warning per stop list
    assert_no_warnings(vec.fit_transform, ['hello world'])
    assert _check_stop_words_consistency(vec) is None

    # Test caching of inconsistency assessment
    vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND'])
    assert_warns_message(UserWarning, message, vec.fit_transform,
                         ['hello world'])
Example #11
Source File: common_utils.py From interpret-community with MIT License
def create_multiclass_sparse_newsgroups_data():
    remove = ('headers', 'footers', 'quotes')
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    from sklearn.datasets import fetch_20newsgroups
    ngroups = fetch_20newsgroups(subset='train', categories=categories,
                                 shuffle=True, random_state=42, remove=remove)
    x_train, x_test, y_train, y_validation = train_test_split(ngroups.data, ngroups.target,
                                                              test_size=0.02, random_state=42)
    from sklearn.feature_extraction.text import HashingVectorizer
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=2**16)
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, y_train, y_validation, categories, vectorizer
Example #12
Source File: ngrams_handling.py From JaSt with GNU General Public License v3.0
def csr_proba_of_n_grams_hash_storage(input_file, tolerance, n, n_features):
    """
    Maps an input file to a CSR matrix containing the frequency of its n-grams.
        - Production of n-grams and analysis of their frequency (+ normalization);
        - Each n-gram is mapped to a consistent dimension of a vector space
          (with a hash; collisions are possible if n_features is too small);
        - Storage of the results in a CSR matrix.

    -------
    Parameters:
    - input_file: str
        Path of the file to be analysed.
    - tolerance: str
        Indicates whether esprima should tolerate a few cases of syntax errors
        (corresponds to esprima's tolerant option). The values 'true' and 'false'
        shall be used to enable this tolerant mode.
    - n: int
        Stands for the size of the sliding-window which goes through the units
        contained in the files to be analysed.
    - n_features: int
        Size of the resulting vector space. This can be changed in nb_features(n).

    -------
    Returns:
    - csr_matrix
        Non-compacted dimension: 1 x n_features;
        Value: probability of occurrences of an n-gram.
    - or None if the file could not be parsed.
    """

    tokens_int = tokens.tokens_to_numbers(input_file, tolerance)
    if tokens_int is not None:
        corpus = [str(tokens_int)]
        vectorizer = HashingVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(n, n),
                                       norm='l1', alternate_sign=False, n_features=n_features)
        res = vectorizer.fit_transform(corpus)
        return res
    return None
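Since alternate_sign=False keeps every hashed count non-negative, the norm='l1' above turns each row into a probability distribution over hashed n-grams (the row sums to 1). A hedged sketch with a made-up stand-in for the token stream that tokens_to_numbers() would produce:

from sklearn.feature_extraction.text import HashingVectorizer

# Illustrative token stream: a whitespace-separated string of token identifiers
corpus = ["1 5 5 2 1 5 3"]

vectorizer = HashingVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(2, 2),
                               norm='l1', alternate_sign=False, n_features=1024)
res = vectorizer.fit_transform(corpus)
print(res.sum())  # ~1.0: the bigram counts have been normalised to probabilities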
Example #13
Source File: test_simple_imputer.py From datawig with Apache License 2.0
def test_hpo_explainable(test_dir, data_frame):
    from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer

    feature_col, label_col = "feature", "label"
    df = data_frame(feature_col=feature_col, label_col=label_col)

    for explainable, vectorizer in [(False, HashingVectorizer), (True, TfidfVectorizer)]:
        imputer = SimpleImputer(
            input_columns=[feature_col],
            output_column=label_col,
            output_path=test_dir,
            is_explainable=explainable
        ).fit_hpo(df, num_epochs=3)

        assert isinstance(imputer.imputer.data_encoders[0].vectorizer, vectorizer)
Example #14
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_word_analyzer_unigrams():
    for Vectorizer in (CountVectorizer, HashingVectorizer):
        wa = Vectorizer(strip_accents='ascii').build_analyzer()
        text = ("J'ai mang\xe9 du kangourou ce midi, "
                "c'\xe9tait pas tr\xeas bon.")
        expected = ['ai', 'mange', 'du', 'kangourou', 'ce', 'midi',
                    'etait', 'pas', 'tres', 'bon']
        assert_equal(wa(text), expected)

        text = "This is a test, really.\n\n I met Harry yesterday."
        expected = ['this', 'is', 'test', 'really', 'met', 'harry',
                    'yesterday']
        assert_equal(wa(text), expected)

        wa = Vectorizer(input='file').build_analyzer()
        text = StringIO("This is a test with a file-like object!")
        expected = ['this', 'is', 'test', 'with', 'file', 'like',
                    'object']
        assert_equal(wa(text), expected)

        # with custom preprocessor
        wa = Vectorizer(preprocessor=uppercase).build_analyzer()
        text = ("J'ai mang\xe9 du kangourou ce midi, "
                " c'\xe9tait pas tr\xeas bon.")
        expected = ['AI', 'MANGE', 'DU', 'KANGOUROU', 'CE', 'MIDI',
                    'ETAIT', 'PAS', 'TRES', 'BON']
        assert_equal(wa(text), expected)

        # with custom tokenizer
        wa = Vectorizer(tokenizer=split_tokenize,
                        strip_accents='ascii').build_analyzer()
        text = ("J'ai mang\xe9 du kangourou ce midi, "
                "c'\xe9tait pas tr\xeas bon.")
        expected = ["j'ai", 'mange', 'du', 'kangourou', 'ce', 'midi,',
                    "c'etait", 'pas', 'tres', 'bon.']
        assert_equal(wa(text), expected)
Example #15
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_hashing_vectorizer():
    v = HashingVectorizer()
    X = v.transform(ALL_FOOD_DOCS)
    token_nnz = X.nnz
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # By default the hashed values receive a random sign and l2 normalization
    # makes the feature values bounded
    assert_true(np.min(X.data) > -1)
    assert_true(np.min(X.data) < 0)
    assert_true(np.max(X.data) > 0)
    assert_true(np.max(X.data) < 1)

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[0].data, 2), 1.0)

    # Check vectorization with some non-default parameters
    v = HashingVectorizer(ngram_range=(1, 2), non_negative=True, norm='l1')
    X = v.transform(ALL_FOOD_DOCS)
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # ngrams generate more non zeros
    ngrams_nnz = X.nnz
    assert_true(ngrams_nnz > token_nnz)
    assert_true(ngrams_nnz < 2 * token_nnz)

    # makes the feature values bounded
    assert_true(np.min(X.data) > 0)
    assert_true(np.max(X.data) < 1)

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[0].data, 1), 1.0)
Example #16
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = (
        "\xd0\x9c\xd0\xb0\xd1\x88\xd0\xb8\xd0\xbd\xd0\xbd\xd0\xbe\xd0"
        "\xb5 \xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb5\xd0\xbd\xd0\xb8\xd0"
        "\xb5 \xe2\x80\x94 \xd0\xbe\xd0\xb1\xd1\x88\xd0\xb8\xd1\x80\xd0\xbd"
        "\xd1\x8b\xd0\xb9 \xd0\xbf\xd0\xbe\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb7"
        "\xd0\xb4\xd0\xb5\xd0\xbb \xd0\xb8\xd1\x81\xd0\xba\xd1\x83\xd1\x81"
        "\xd1\x81\xd1\x82\xd0\xb2\xd0\xb5\xd0\xbd\xd0\xbd\xd0\xbe\xd0\xb3"
        "\xd0\xbe \xd0\xb8\xd0\xbd\xd1\x82\xd0\xb5\xd0\xbb\xd0\xbb\xd0"
        "\xb5\xd0\xba\xd1\x82\xd0\xb0, \xd0\xb8\xd0\xb7\xd1\x83\xd1\x87"
        "\xd0\xb0\xd1\x8e\xd1\x89\xd0\xb8\xd0\xb9 \xd0\xbc\xd0\xb5\xd1\x82"
        "\xd0\xbe\xd0\xb4\xd1\x8b \xd0\xbf\xd0\xbe\xd1\x81\xd1\x82\xd1\x80"
        "\xd0\xbe\xd0\xb5\xd0\xbd\xd0\xb8\xd1\x8f \xd0\xb0\xd0\xbb\xd0\xb3"
        "\xd0\xbe\xd1\x80\xd0\xb8\xd1\x82\xd0\xbc\xd0\xbe\xd0\xb2, \xd1\x81"
        "\xd0\xbf\xd0\xbe\xd1\x81\xd0\xbe\xd0\xb1\xd0\xbd\xd1\x8b\xd1\x85 "
        "\xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb0\xd1\x82\xd1\x8c\xd1\x81\xd1"
        "\x8f.")

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert_equal(X_counted.shape, (1, 15))

    vect = HashingVectorizer(norm=None, non_negative=True)
    X_hashed = vect.transform([document])
    assert_equal(X_hashed.shape, (1, 2 ** 20))

    # No collisions on such a small dataset
    assert_equal(X_counted.nnz, X_hashed.nnz)

    # When norm is None and non_negative, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))
Example #17
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_vectorizer_string_object_as_input():
    message = ("Iterable over raw text documents expected, "
               "string object received.")
    for vec in [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()]:
        assert_raise_message(
            ValueError, message, vec.fit_transform, "hello world!")
        assert_raise_message(
            ValueError, message, vec.fit, "hello world!")
        assert_raise_message(
            ValueError, message, vec.transform, "hello world!")
Example #18
Source File: common_utils.py From interpret-community with MIT License
def create_binary_sparse_newsgroups_data():
    categories = ['alt.atheism', 'soc.religion.christian']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
    class_names = ['atheism', 'christian']
    x_train = newsgroups_train.data
    x_test = newsgroups_test.data
    y_train = newsgroups_train.target
    y_validation = newsgroups_test.target
    from sklearn.feature_extraction.text import HashingVectorizer
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=2**16)
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, y_train, y_validation, class_names, vectorizer
Example #19
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_hashingvectorizer_nan_in_docs():
    # np.nan can appear when using pandas to load text fields from a csv file
    # with missing values.
    message = "np.nan is an invalid document, expected byte or unicode string."
    exception = ValueError

    def func():
        hv = HashingVectorizer()
        hv.fit_transform(['hello world', np.nan, 'hello hello'])

    assert_raise_message(exception, message, func)
Example #20
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_pickling_vectorizer():
    instances = [
        HashingVectorizer(),
        HashingVectorizer(norm='l1'),
        HashingVectorizer(binary=True),
        HashingVectorizer(ngram_range=(1, 2)),
        CountVectorizer(),
        CountVectorizer(preprocessor=strip_tags),
        CountVectorizer(analyzer=lazy_analyze),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
        TfidfVectorizer(),
        TfidfVectorizer(analyzer=lazy_analyze),
        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
    ]

    for orig in instances:
        s = pickle.dumps(orig)
        copy = pickle.loads(s)
        assert_equal(type(copy), orig.__class__)
        assert_equal(copy.get_params(), orig.get_params())
        if IS_PYPY and isinstance(orig, HashingVectorizer):
            continue
        else:
            assert_array_equal(
                copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
                orig.fit_transform(JUNK_FOOD_DOCS).toarray())
Example #21
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_hashing_vectorizer():
    v = HashingVectorizer()
    X = v.transform(ALL_FOOD_DOCS)
    token_nnz = X.nnz
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # By default the hashed values receive a random sign and l2 normalization
    # makes the feature values bounded
    assert np.min(X.data) > -1
    assert np.min(X.data) < 0
    assert np.max(X.data) > 0
    assert np.max(X.data) < 1

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[0].data, 2), 1.0)

    # Check vectorization with some non-default parameters
    v = HashingVectorizer(ngram_range=(1, 2), norm='l1')
    X = v.transform(ALL_FOOD_DOCS)
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # ngrams generate more non zeros
    ngrams_nnz = X.nnz
    assert ngrams_nnz > token_nnz
    assert ngrams_nnz < 2 * token_nnz

    # makes the feature values bounded
    assert np.min(X.data) > -1
    assert np.max(X.data) < 1

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[0].data, 1), 1.0)
Example #22
Source File: rct_robot.py From robotreviewer with GNU General Public License v3.0
def __init__(self):
    from keras.preprocessing import sequence
    from keras.models import load_model
    from keras.models import Sequential
    from keras.preprocessing import sequence
    from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
    from keras.layers import Embedding
    from keras.layers import Convolution1D, MaxPooling1D
    from keras import backend as K
    from keras.models import Model
    from keras.regularizers import l2

    global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
    global Embedding, Convolution1D, MaxPooling1D, K, Model, l2

    self.svm_clf = MiniClassifier(os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
    cnn_weight_files = glob.glob(os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
    self.cnn_clfs = [load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files]
    self.svm_vectorizer = HashingVectorizer(binary=False, ngram_range=(1, 1), stop_words='english')
    self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'),
                                          stop_words='english')

    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_model_calibration.json'), 'r') as f:
        self.constants = json.load(f)

    self.calibration_lr = {}
    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/svm_cnn_ptyp_calibration.pck'), 'rb') as f:
        self.calibration_lr['svm_cnn_ptyp'] = pickle.load(f)
    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/svm_cnn_calibration.pck'), 'rb') as f:
        self.calibration_lr['svm_cnn'] = pickle.load(f)
Example #23
Source File: bias_ab_robot.py From robotreviewer with GNU General Public License v3.0
def __init__(self):
    with open(robotreviewer.get_data(os.path.join('bias_ab', 'bias_prob_clf.pck')), 'rb') as f:
        self.clf = pickle.load(f)
    self.vec = HashingVectorizer(ngram_range=(1, 3), stop_words='english')
Example #24
Source File: pubmed_robot.py From robotreviewer with GNU General Public License v3.0
def __init__(self):
    raw_data = np.load(robotreviewer.get_data('pubmed/pubmed_title_hash_2016_07_24.npz'))
    self.vec_ti = csr_matrix((raw_data['data'], raw_data['indices'], raw_data['indptr']),
                             raw_data['shape'])
    self.pmid_ind = np.load(robotreviewer.get_data('pubmed/pubmed_index_2016_07_24.npz'))['pmid_ind']
    self.vectorizer = HashingVectorizer(binary=True, stop_words='english')

    # load database
    self.connection = sqlite3.connect(robotreviewer.get_data('pubmed/pubmed_rcts_2016_07_24.sqlite'))
    self.c = self.connection.cursor()
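Because binary=True plus the default l2 norm yields unit-length, set-like title vectors, stored titles can be ranked against a query with sparse dot products. A hedged sketch of that idea (titles and variable names are illustrative, not the robotreviewer code):

from sklearn.feature_extraction.text import HashingVectorizer

titles = ["Aspirin for primary prevention",
          "Aspirin in primary prevention of cardiovascular disease"]
query = ["aspirin primary prevention"]

vec = HashingVectorizer(binary=True, stop_words='english')
T = vec.transform(titles)
q = vec.transform(query)

# Rows are l2-normalised, so the sparse dot product is a cosine-style score.
scores = T.dot(q.T).toarray().ravel()
print(scores)  # one similarity score per stored title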
Example #25
Source File: similarity_encoder.py From dirty_cat with BSD 3-Clause "New" or "Revised" License
def get_kmeans_prototypes(X, n_prototypes, hashing_dim=128, ngram_range=(3, 3),
                          sparse=False, sample_weight=None, random_state=None):
    """
    Computes prototypes based on:
      - dimensionality reduction (via hashing n-grams)
      - k-means clustering
      - nearest neighbor
    """
    vectorizer = HashingVectorizer(analyzer='char', norm=None,
                                   alternate_sign=False,
                                   ngram_range=ngram_range,
                                   n_features=hashing_dim)
    projected = vectorizer.transform(X)
    if not sparse:
        projected = projected.toarray()
    kmeans = KMeans(n_clusters=n_prototypes, random_state=random_state)
    kmeans.fit(projected, sample_weight=sample_weight)
    centers = kmeans.cluster_centers_
    neighbors = NearestNeighbors()
    neighbors.fit(projected)
    indexes_prototypes = np.unique(neighbors.kneighbors(centers, 1)[-1])
    if indexes_prototypes.shape[0] < n_prototypes:
        warnings.warn('Final number of unique prototypes is lower than '
                      'n_prototypes (expected)')
    return np.sort(X[indexes_prototypes])
Example #26
Source File: pipeline_builder.py From texta with GNU General Public License v3.0
def get_pipeline_builder():
    pipe_builder = PipelineBuilder()

    # Feature Extraction
    params = {}
    pipe_builder.add_extractor('HashingVectorizer', HashingVectorizer, 'Hashing Vectorizer', params)

    params = {'ngram_range': [(1, 1), (1, 2)], 'min_df': [5]}
    pipe_builder.add_extractor('CountVectorizer', CountVectorizer, 'Count Vectorizer', params)

    params = {'ngram_range': [(1, 1), (1, 2)], 'min_df': [5]}
    pipe_builder.add_extractor('TfidfVectorizer', TfidfVectorizer, 'TfIdf Vectorizer', params)

    # Dimension Reduction
    params = {}
    pipe_builder.add_reductor('No_Reduction', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_reductor('TruncatedSVD', TruncatedSVD, 'Truncated SVD', params)

    # Normalization
    params = {}
    pipe_builder.add_normalizer('No_Normalization', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_normalizer('Normalizer', Normalizer, 'Normalizer', params)

    # Classification Models
    params = {}
    pipe_builder.add_classifier('LogisticRegressionClassifier', LogisticRegression, 'Logistic Regression', params)

    params = {}
    pipe_builder.add_classifier('LinearSVC', LinearSVC, 'LinearSVC', params)

    params = {}
    pipe_builder.add_classifier('KNeighborsClassifier', KNeighborsClassifier, 'K-Neighbors', params)

    params = {}
    pipe_builder.add_classifier('RadiusNeighborsClassifier', RadiusNeighborsClassifier, 'Radius Neighbors', params)

    return pipe_builder
Example #27
Source File: similarity_encoder.py From dirty_cat with BSD 3-Clause "New" or "Revised" License
def ngram_similarity(X, cats, ngram_range, hashing_dim, dtype=np.float64):
    """
    Similarity encoding for dirty categorical variables:
    Given two arrays of strings, returns the similarity encoding matrix
    of size len(X) x len(cats)

    ngram_sim(s_i, s_j) =
        ||min(ci, cj)||_1 / (||ci||_1 + ||cj||_1 - ||min(ci, cj)||_1)
    """
    min_n, max_n = ngram_range
    unq_X = np.unique(X)
    cats = np.array([' %s ' % cat for cat in cats])
    unq_X_ = np.array([' %s ' % x for x in unq_X])
    if not hashing_dim:
        vectorizer = CountVectorizer(analyzer='char',
                                     ngram_range=(min_n, max_n),
                                     dtype=dtype)
        vectorizer.fit(np.concatenate((cats, unq_X_)))
    else:
        vectorizer = HashingVectorizer(analyzer='char',
                                       ngram_range=(min_n, max_n),
                                       n_features=hashing_dim,
                                       norm=None,
                                       alternate_sign=False,
                                       dtype=dtype)
        vectorizer.fit(X)
    count_cats = vectorizer.transform(cats)
    count_X = vectorizer.transform(unq_X_)
    # We don't need the vectorizer anymore, delete it to save memory
    del vectorizer
    sum_cats = np.asarray(count_cats.sum(axis=1))
    SE_dict = {}
    for i, x in enumerate(count_X):
        _, nonzero_idx, nonzero_vals = sparse.find(x)
        samegrams = np.asarray(
            (count_cats[:, nonzero_idx].minimum(nonzero_vals)).sum(axis=1))
        allgrams = x.sum() + sum_cats - samegrams
        similarity = np.divide(samegrams, allgrams)
        SE_dict[unq_X[i]] = similarity.reshape(-1)
    # We don't need the counts anymore, delete them to save memory
    del count_cats, count_X
    out = np.empty((len(X), similarity.size), dtype=dtype)
    for x, out_row in zip(X, out):
        out_row[:] = SE_dict[x]
    return np.nan_to_num(out, copy=False)