Python sklearn.feature_extraction.FeatureHasher() Examples
The following are 30 code examples of sklearn.feature_extraction.FeatureHasher().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.feature_extraction, or try the search function.
Example #1
Source File: pe_imports_features.py From driverlessai-recipes with Apache License 2.0 | 7 votes |
def imports_features(self, lief_binary):
    """Hash a binary's imported libraries and import entries into named features.

    Args:
        lief_binary: Object exposing ``imports``; each import has ``name`` and
            ``entries``, and each entry has ``is_ordinal``, ``ordinal`` and
            ``name`` (presumably a parsed LIEF PE binary -- confirm with caller).

    Returns:
        dict mapping ``Imports_libraries_hash_<i>`` (256 keys) and
        ``Imports_entries_hash_<i>`` (1024 keys) to the hashed values.
    """
    from sklearn.feature_extraction import FeatureHasher

    # Collect the imported symbol names per library. Entries imported by
    # ordinal get a synthetic "ordinal<N>" name; real names are truncated.
    names_by_library = {}
    for library in lief_binary.imports:
        bucket = names_by_library.setdefault(library.name, [])
        for entry in library.entries:
            if entry.is_ordinal:
                bucket.append("ordinal" + str(entry.ordinal))
            else:
                bucket.append(entry.name[:10000])

    # Distinct, case-folded library names form a single sample for the hasher.
    library_tokens = sorted({name.lower() for name in names_by_library})
    library_vector = FeatureHasher(256, input_type='string').transform([library_tokens]).toarray()[0]
    hashed = {
        f'Imports_libraries_hash_{idx}': value
        for idx, value in enumerate(library_vector)
    }

    # Fully qualified "library:symbol" tokens, again hashed as one sample.
    entry_tokens = sorted(
        library_name.lower() + ':' + symbol
        for library_name, symbols in names_by_library.items()
        for symbol in symbols
    )
    entry_vector = FeatureHasher(1024, input_type='string').transform([entry_tokens]).toarray()[0]
    hashed.update({
        f'Imports_entries_hash_{idx}': value
        for idx, value in enumerate(entry_vector)
    })
    return hashed
Example #2
Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_hash_collisions():
    """Check how colliding tokens combine when there is a single output column."""
    samples = [list("Thequickbrownfoxjumped")]
    token_count = len(samples[0])

    def single_bucket_value(alternate_sign, non_negative):
        # With n_features=1 there is only one output column to land in.
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               non_negative=non_negative,
                               n_features=1, input_type='string')
        return hasher.fit_transform(samples).data[0]

    # Check that some of the hashed tokens are added with an opposite sign
    # and cancel out.
    assert abs(single_bucket_value(True, False)) < token_count
    assert abs(single_bucket_value(True, True)) < token_count

    # Without sign alternation every token is expected to add up.
    assert single_bucket_value(False, True) == token_count
Example #3
Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_hasher_alternate_sign():
    """Check the sign of hashed values for each alternate_sign/non_negative pair."""
    samples = [list("Thequickbrownfoxjumped")]

    def hashed_values(alternate_sign, non_negative):
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               non_negative=non_negative,
                               input_type='string')
        return hasher.fit_transform(samples).data

    signed = hashed_values(True, False)
    assert signed.min() < 0 and signed.max() > 0

    signed_non_negative = hashed_values(True, True)
    assert signed_non_negative.min() > 0

    unsigned_non_negative = hashed_values(False, True)
    assert unsigned_non_negative.min() > 0

    # With initially positive features, the non_negative option should
    # have no impact when alternate_sign=False.
    unsigned_plain = hashed_values(False, False)
    assert_array_equal(unsigned_non_negative, unsigned_plain)
Example #4
Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_feature_hasher_strings():
    """Hash mixed str/bytes token rows at several widths and check the totals."""
    # Mix byte and Unicode strings; note that "foo" is a duplicate in row 0.
    token_rows = [["foo", "bar", "baz", "foo".encode("ascii")],
                  ["bar".encode("ascii"), "baz", "quux"]]

    for exponent in (7, 9, 11, 16, 22):
        width = 2 ** exponent

        # Feed a one-shot generator rather than the list itself.
        row_stream = (row for row in token_rows)

        hasher = FeatureHasher(width, input_type="string",
                               alternate_sign=False)
        hashed = hasher.transform(row_stream)

        assert_equal(hashed.shape[0], len(token_rows))
        assert_equal(hashed.shape[1], width)

        # Row totals equal the number of tokens supplied for each row.
        assert_equal(hashed[0].sum(), 4)
        assert_equal(hashed[1].sum(), 3)

        assert_equal(hashed.nnz, 6)
Example #5
Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_feature_hasher_pairs_with_string_values():
    """Check "pair" input whose values mix numbers and strings.

    The assertions expect every string-valued pair to contribute a
    magnitude of 1, and identical records to hash identically.
    """
    def nonzero_magnitudes(row):
        return np.abs(row[row != 0])

    mixed_records = [{"foo": 1, "bar": "a"},
                     {"baz": "abc", "quux": 4, "foo": -1}]
    pair_stream = (iter(record.items()) for record in mixed_records)
    hasher = FeatureHasher(n_features=16, input_type="pair")
    first, second = hasher.transform(pair_stream).toarray()
    assert_equal([1, 1], sorted(nonzero_magnitudes(first)))
    assert_equal([1, 1, 4], sorted(nonzero_magnitudes(second)))

    repeated_records = [{"bax": "abc"}, {"bax": "abc"}]
    pair_stream = (iter(record.items()) for record in repeated_records)
    first, second = hasher.transform(pair_stream).toarray()
    assert_equal([1], nonzero_magnitudes(first))
    assert_equal([1], nonzero_magnitudes(second))
    assert_array_equal(first, second)
Example #6
Source File: run_model.py From kaggle_avazu_benchmark with Apache License 2.0 | 6 votes |
def main(neg_rate, submission_num, n_iter, train_path):
    """Fit a hashed-feature SGD classifier and write a submission.

    Args:
        neg_rate: Passed to the partial-fit pipeline; its log is also added
            to the fitted intercept (presumably the negative down-sampling
            rate -- confirm against ``ml.PartialFitter``).
        submission_num: Number handed to ``pp.write_submission``.
        n_iter: Iteration count handed to the partial-fit pipeline.
        train_path: Path of the training data.
    """
    test_path = 'original_data/test'

    ids = list(pp.get_int_field('id', test_path))
    clicks = pp.get_int_field('click', train_path)

    # Data generators for the training and test rows.
    train_rows = pp.data_generator(pp.clean_parse_row, train_path)
    test_rows = pp.data_generator(pp.clean_parse_row, test_path)

    # Estimators: hashed pair features feeding a logistic SGD model.
    feature_hasher = FeatureHasher(n_features=2 ** 20, input_type='pair')
    classifier = SGDClassifier(loss='log', n_iter=1, alpha=.003, penalty='l2')

    # Fit the pipeline in batches.
    pipeline = ml.PartialFitter([feature_hasher, classifier],
                                batch_size=10000,
                                logging=True,
                                n_iter=n_iter,
                                neg_rate=neg_rate)
    pipeline.partial_fit(X=train_rows, y=clicks)

    # Correct the intercept of the last step before predicting.
    final_step = pipeline.steps[-1]
    final_step.intercept_[0] += np.log(neg_rate)

    positive_probabilities = pipeline.predict_proba(newX=test_rows)[:, 1]
    pp.write_submission(number=submission_num, ids=ids,
                        preds=positive_probabilities)
Example #7
Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_feature_hasher_pairs_with_string_values():
    """Check "pair" input whose values mix numbers and strings.

    The assertions expect every string-valued pair to contribute a
    magnitude of 1, and identical records to hash identically.
    """
    def magnitudes_of_nonzeros(vector):
        return np.abs(vector[vector != 0])

    records = [{"foo": 1, "bar": "a"},
               {"baz": u"abc", "quux": 4, "foo": -1}]
    item_iterators = (iter(entry.items()) for entry in records)
    hasher = FeatureHasher(n_features=16, input_type="pair")
    row_a, row_b = hasher.transform(item_iterators).toarray()
    assert_equal([1, 1], sorted(magnitudes_of_nonzeros(row_a)))
    assert_equal([1, 1, 4], sorted(magnitudes_of_nonzeros(row_b)))

    duplicates = [{"bax": "abc"}, {"bax": "abc"}]
    item_iterators = (iter(entry.items()) for entry in duplicates)
    row_a, row_b = hasher.transform(item_iterators).toarray()
    assert_equal([1], magnitudes_of_nonzeros(row_a))
    assert_equal([1], magnitudes_of_nonzeros(row_b))
    assert_array_equal(row_a, row_b)
Example #8
Source File: test_feature_extraction.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_objectmapper(self):
    """Check that the frame's feature_extraction accessor exposes the same objects as ``fe``."""
    df = pdml.ModelFrame([])

    # Dotted attribute paths that must resolve to the identical object on
    # both ``df.feature_extraction`` and ``fe``.
    mirrored_paths = (
        "DictVectorizer",
        "FeatureHasher",
        "image.img_to_graph",
        "image.grid_to_graph",
        "image.extract_patches_2d",
        "image.reconstruct_from_patches_2d",
        "image.PatchExtractor",
        "text.CountVectorizer",
        "text.HashingVectorizer",
        "text.TfidfTransformer",
        "text.TfidfVectorizer",
    )

    for dotted in mirrored_paths:
        exposed, expected = df.feature_extraction, fe
        for attribute in dotted.split("."):
            exposed = getattr(exposed, attribute)
            expected = getattr(expected, attribute)
        self.assertIs(exposed, expected)
Example #9
Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_feature_hasher_strings():
    """Hash mixed str/bytes token rows with non_negative=True at several widths."""
    # Mix byte and Unicode strings; note that "foo" is a duplicate in row 0.
    token_rows = [["foo", "bar", "baz", "foo".encode("ascii")],
                  ["bar".encode("ascii"), "baz", "quux"]]

    for exponent in (7, 9, 11, 16, 22):
        width = 2 ** exponent

        # Feed a one-shot generator rather than the list itself.
        row_stream = (row for row in token_rows)

        hasher = FeatureHasher(width, non_negative=True,
                               input_type="string")
        hashed = hasher.transform(row_stream)

        assert_equal(hashed.shape[0], len(token_rows))
        assert_equal(hashed.shape[1], width)

        # Every stored value must be strictly positive.
        assert_true(np.all(hashed.data > 0))
        assert_equal(hashed[0].sum(), 4)
        assert_equal(hashed[1].sum(), 3)

        assert_equal(hashed.nnz, 6)
Example #10
Source File: pefeatures.py From youarespecial with MIT License | 5 votes |
def __call__(self, binary):
    """Return a flat vector hashing the binary's libraries and imported functions.

    The result concatenates a 256-wide hash of the lower-cased library names
    with a 1024-wide hash of the qualified import names, cast to ``self.dtype``.
    """
    library_names = [name.lower() for name in binary.libraries]

    # Build a string like "kernel32.dll:CreateFileMappingA" for each entry.
    qualified_imports = [
        library.name.lower() + ':' + entry.name
        for library in binary.imports
        for entry in library.entries
    ]

    # Two separate elements: libraries (alone) and fully-qualified names of
    # imported functions. Each token list is hashed as a single sample.
    library_block = FeatureHasher(
        256, input_type="string", dtype=self.dtype
    ).transform([library_names]).toarray()
    import_block = FeatureHasher(
        1024, input_type="string", dtype=self.dtype
    ).transform([qualified_imports]).toarray()

    combined = np.concatenate([library_block, import_block], axis=-1)
    return combined.flatten().astype(self.dtype)
Example #11
Source File: _machine_learning.py From qlik-py-tools with MIT License | 5 votes |
def hasher(df, col, n_features):
    """
    Hash the unique values of column ``col`` in ``df`` into ``n_features`` columns.

    Returns a dataframe indexed by the unique values, with one column per
    hashed feature named ``<col><i>``.
    """
    distinct = pd.DataFrame(df[col].unique(), columns=[col])
    encoder = FeatureHasher(n_features=n_features, input_type="string")

    # NOTE(review): each unique value is handed to the hasher as a whole
    # sample rather than wrapped in a list; for input_type="string" the
    # scikit-learn docs describe samples as iterables of strings -- confirm
    # this is intended before upgrading scikit-learn.
    hashed_matrix = encoder.fit_transform(distinct.loc[:, col])

    hashed_frame = pd.DataFrame(hashed_matrix.toarray()).add_prefix(col)
    return distinct.join(hashed_frame).set_index(col)
Example #12
Source File: pefeatures.py From youarespecial with MIT License | 5 votes |
def __call__(self, binary):
    """Return a flat 128-wide hash of the binary's exported function names."""
    hasher = FeatureHasher(128, input_type="string", dtype=self.dtype)
    # All exported names together form one sample.
    hashed = hasher.transform([binary.exported_functions])
    return hashed.toarray().flatten().astype(self.dtype)
Example #13
Source File: pefeatures.py From youarespecial with MIT License | 5 votes |
def __call__(self, binary):
    """Return a flat vector describing the binary's header fields.

    Layout: the time stamp, five 10-wide hashes (machine, characteristics,
    subsystem, DLL characteristics, magic), then eleven numeric optional
    header fields -- 62 values in total, cast to ``self.dtype``.
    """
    header = binary.header
    optional = binary.optional_header

    def hashed(tokens):
        # Hash one list of string tokens into a (1, 10) dense block.
        encoder = FeatureHasher(10, input_type="string", dtype=self.dtype)
        return encoder.transform([tokens]).toarray()

    numeric_fields = (
        "major_image_version",
        "minor_image_version",
        "major_linker_version",
        "minor_linker_version",
        "major_operating_system_version",
        "minor_operating_system_version",
        "major_subsystem_version",
        "minor_subsystem_version",
        "sizeof_code",
        "sizeof_headers",
        "sizeof_heap_commit",
    )

    blocks = [[[header.time_date_stamps]]]
    blocks.append(hashed([str(header.machine)]))
    blocks.append(hashed([str(flag) for flag in header.characteristics_list]))
    blocks.append(hashed([str(optional.subsystem)]))
    blocks.append(hashed([str(flag) for flag in optional.dll_characteristics_lists]))
    blocks.append(hashed([str(optional.magic)]))
    # Each scalar is wrapped as a (1, 1) block so it concatenates on the last axis.
    blocks.extend([[getattr(optional, field)]] for field in numeric_fields)

    return np.concatenate(blocks, axis=-1).flatten().astype(self.dtype)
Example #14
Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_feature_hasher_dicts():
    """Check that dict input is the default and matches equivalent "pair" input."""
    default_hasher = FeatureHasher(n_features=16)
    assert_equal("dict", default_hasher.input_type)

    records = [{"foo": "bar", "dada": 42, "tzara": 37},
               {"foo": "baz", "gaga": u"string1"}]
    from_dicts = FeatureHasher(n_features=16).transform(records)

    # The same records, presented as iterators over (key, value) pairs.
    pair_stream = (iter(record.items()) for record in records)
    from_pairs = FeatureHasher(n_features=16,
                               input_type="pair").transform(pair_stream)

    assert_array_equal(from_dicts.toarray(), from_pairs.toarray())
Example #15
Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_feature_hasher_pairs():
    """Check that "pair" input keeps each value's magnitude in the hashed rows."""
    records = [{"foo": 1, "bar": 2},
               {"baz": 3, "quux": 4, "foo": -1}]
    pair_stream = (iter(record.items()) for record in records)

    hasher = FeatureHasher(n_features=16, input_type="pair")
    first, second = hasher.transform(pair_stream).toarray()

    # Signs may differ after hashing, so compare sorted absolute values.
    first_magnitudes = sorted(np.abs(first[first != 0]))
    second_magnitudes = sorted(np.abs(second[second != 0]))
    assert_equal([1, 2], first_magnitudes)
    assert_equal([1, 3, 4], second_magnitudes)
Example #16
Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_hash_empty_input():
    """Check that empty samples of several iterable kinds hash to all-zero rows."""
    width = 16
    empty_samples = [[], (), iter(range(0))]

    hasher = FeatureHasher(n_features=width, input_type="string")
    dense = hasher.transform(empty_samples).A

    expected = np.zeros((len(empty_samples), width))
    assert_array_equal(dense, expected)
Example #17
Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_hasher_set_params():
    """Test delayed input validation in fit (useful for grid search)."""
    unvalidated = FeatureHasher()
    # Setting the invalid value must not raise; only fit is expected to.
    unvalidated.set_params(n_features=np.inf)
    assert_raises(TypeError, unvalidated.fit)
Example #18
Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_hasher_zeros():
    """Assert that no zeros are materialized in the output."""
    zero_valued_sample = [{'foo': 0}]
    hashed = FeatureHasher().transform(zero_valued_sample)
    stored_shape = hashed.data.shape
    assert_equal(stored_shape, (0,))
Example #19
Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_hasher_negative():
    """Check the sign of hashed values when the input holds negative values."""
    samples = [{"foo": 2, "bar": -4, "baz": -1}.items()]

    def hashed_values(alternate_sign, non_negative):
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               non_negative=non_negative,
                               input_type="pair")
        return hasher.fit_transform(samples).data

    # The expectations are the same for either alternate_sign setting:
    # mixed signs without non_negative, strictly positive with it.
    for alternate_sign in (False, True):
        values = hashed_values(alternate_sign, False)
        assert_true(values.min() < 0 and values.max() > 0)

        values = hashed_values(alternate_sign, True)
        assert_true(values.min() > 0)
Example #20
Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_hasher_alternate_sign():
    """Check that alternate_sign controls whether negative values appear."""
    samples = [list("Thequickbrownfoxjumped")]

    def hashed_values(alternate_sign):
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               input_type='string')
        return hasher.fit_transform(samples).data

    # Alternating signs yield both negative and positive entries.
    signed = hashed_values(True)
    assert signed.min() < 0 and signed.max() > 0

    # Without alternation every stored entry is strictly positive.
    unsigned = hashed_values(False)
    assert unsigned.min() > 0
Example #21
Source File: pefeatures.py From gym-malware with MIT License | 5 votes |
def __call__(self, binary):
    """Return a flat vector hashing the binary's libraries and imported functions.

    The result concatenates a 256-wide hash of the lower-cased library names
    with a 1024-wide hash of the qualified import names, cast to ``self.dtype``.
    """
    lowered_libraries = [lib_name.lower() for lib_name in binary.libraries]

    # Build a string like "kernel32.dll:CreateFileMappingA" for each entry.
    import_tokens = [
        imported.name.lower() + ':' + symbol.name
        for imported in binary.imports
        for symbol in imported.entries
    ]

    # Two separate elements: libraries (alone) and fully-qualified names of
    # imported functions. Each token list is hashed as a single sample.
    hashed_libraries = FeatureHasher(
        256, input_type="string", dtype=self.dtype
    ).transform([lowered_libraries]).toarray()
    hashed_imports = FeatureHasher(
        1024, input_type="string", dtype=self.dtype
    ).transform([import_tokens]).toarray()

    stacked = np.concatenate([hashed_libraries, hashed_imports], axis=-1)
    return stacked.flatten().astype(self.dtype)
Example #22
Source File: pefeatures.py From gym-malware with MIT License | 5 votes |
def __call__(self, binary):
    """Return a flat 128-wide hash of the binary's exported function names."""
    export_hasher = FeatureHasher(128, input_type="string", dtype=self.dtype)
    # All exported names together form one sample.
    sparse_row = export_hasher.transform([binary.exported_functions])
    dense_row = sparse_row.toarray()
    return dense_row.flatten().astype(self.dtype)
Example #23
Source File: pefeatures.py From gym-malware with MIT License | 5 votes |
def __call__(self, binary):
    """Return a flat vector describing the binary's header fields.

    Layout: the time stamp, five 10-wide hashes (machine, characteristics,
    subsystem, DLL characteristics, magic), then eleven numeric optional
    header fields -- 62 values in total, cast to ``self.dtype``.
    """
    file_header = binary.header
    opt_header = binary.optional_header

    def hash_tokens(tokens):
        # Hash one list of string tokens into a (1, 10) dense block.
        return FeatureHasher(
            10, input_type="string", dtype=self.dtype
        ).transform([tokens]).toarray()

    scalar_names = (
        "major_image_version",
        "minor_image_version",
        "major_linker_version",
        "minor_linker_version",
        "major_operating_system_version",
        "minor_operating_system_version",
        "major_subsystem_version",
        "minor_subsystem_version",
        "sizeof_code",
        "sizeof_headers",
        "sizeof_heap_commit",
    )

    parts = [
        [[file_header.time_date_stamps]],
        hash_tokens([str(file_header.machine)]),
        hash_tokens([str(c) for c in file_header.characteristics_list]),
        hash_tokens([str(opt_header.subsystem)]),
        hash_tokens([str(c) for c in opt_header.dll_characteristics_lists]),
        hash_tokens([str(opt_header.magic)]),
    ]
    # Each scalar is wrapped as a (1, 1) block so it concatenates on the last axis.
    for scalar_name in scalar_names:
        parts.append([[getattr(opt_header, scalar_name)]])

    return np.concatenate(parts, axis=-1).flatten().astype(self.dtype)
Example #24
Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_feature_hasher_dicts():
    """Check that dict input is the default and matches equivalent "pair" input."""
    probe = FeatureHasher(n_features=16)
    assert_equal("dict", probe.input_type)

    samples = [{"foo": "bar", "dada": 42, "tzara": 37},
               {"foo": "baz", "gaga": "string1"}]
    dict_result = FeatureHasher(n_features=16).transform(samples)

    # The same samples, presented as iterators over (key, value) pairs.
    item_iterators = (iter(sample.items()) for sample in samples)
    pair_result = FeatureHasher(n_features=16,
                                input_type="pair").transform(item_iterators)

    assert_array_equal(dict_result.toarray(), pair_result.toarray())
Example #25
Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_feature_hasher_pairs():
    """Check that "pair" input keeps each value's magnitude in the hashed rows."""
    samples = [{"foo": 1, "bar": 2},
               {"baz": 3, "quux": 4, "foo": -1}]
    item_iterators = (iter(sample.items()) for sample in samples)

    pair_hasher = FeatureHasher(n_features=16, input_type="pair")
    row_a, row_b = pair_hasher.transform(item_iterators).toarray()

    # Signs may differ after hashing, so compare sorted absolute values.
    magnitudes_a = sorted(np.abs(row_a[row_a != 0]))
    magnitudes_b = sorted(np.abs(row_b[row_b != 0]))
    assert_equal([1, 2], magnitudes_a)
    assert_equal([1, 3, 4], magnitudes_b)
Example #26
Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_hash_empty_input():
    """Check that empty samples of several iterable kinds hash to all-zero rows."""
    column_count = 16
    no_tokens = [[], (), iter(range(0))]

    string_hasher = FeatureHasher(n_features=column_count, input_type="string")
    hashed = string_hasher.transform(no_tokens)

    all_zero = np.zeros((len(no_tokens), column_count))
    assert_array_equal(hashed.A, all_zero)
Example #27
Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_hasher_set_params():
    """Test delayed input validation in fit (useful for grid search)."""
    lazy_hasher = FeatureHasher()
    # Setting the invalid value must not raise; only fit is expected to.
    lazy_hasher.set_params(n_features=np.inf)
    assert_raises(TypeError, lazy_hasher.fit)
Example #28
Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_hasher_zeros():
    """Assert that no zeros are materialized in the output."""
    only_zero = [{'foo': 0}]
    result = FeatureHasher().transform(only_zero)
    # The stored-values array is expected to be empty.
    assert_equal(result.data.shape, (0,))
Example #29
Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_hash_collisions():
    """Check how colliding tokens combine when there is a single output column."""
    samples = [list("Thequickbrownfoxjumped")]
    token_count = len(samples[0])

    def single_bucket_value(alternate_sign):
        # With n_features=1 there is only one output column to land in.
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               n_features=1, input_type='string')
        return hasher.fit_transform(samples).data[0]

    # Check that some of the hashed tokens are added with an opposite sign
    # and cancel out.
    assert abs(single_bucket_value(True)) < token_count

    # Without sign alternation every token is expected to add up.
    assert single_bucket_value(False) == token_count
Example #30
Source File: pe_exports_features.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def exports_features(self, lief_binary):
    """Hash a binary's exported function names into 128 named features.

    Args:
        lief_binary: Object exposing ``exported_functions``, an iterable of
            sortable names (presumably a parsed LIEF binary -- confirm with
            caller).

    Returns:
        dict mapping ``Exports_functions_hash_<i>`` for ``i`` in 0..127 to the
        hashed value, or to ``0`` for every key when there are no exports.
    """
    from sklearn.feature_extraction import FeatureHasher

    n_features = 128
    exports = sorted(lief_binary.exported_functions)

    if not exports:
        return {f'Exports_functions_hash_{i}': 0 for i in range(n_features)}

    # input_type='string' expects one iterable of strings per sample, so the
    # whole export list is wrapped as a single sample. Passing the bare list
    # would treat each name as its own sample and row 0 would cover only the
    # first name.
    hashed_row = FeatureHasher(n_features, input_type='string').transform([exports]).toarray()[0]
    return {
        f'Exports_functions_hash_{i}': x
        for i, x in enumerate(hashed_row)
    }