Python Examples of sklearn.feature_extraction.FeatureHasher

Source File: pe_imports_features.py From driverlessai-recipes with Apache License 2.0

7 votes

def imports_features(self, lief_binary):
        """Hash the import table of a parsed binary into fixed-width features.

        Two groups of features are produced: a 256-wide hash of the
        lower-cased, de-duplicated library names, and a 1024-wide hash of the
        ``library:entry`` strings built from every imported entry.

        :param lief_binary: object exposing ``imports``; each import exposes
            ``name`` and ``entries``, and each entry exposes ``is_ordinal``,
            ``ordinal`` and ``name``.
        :return: dict mapping ``Imports_libraries_hash_<i>`` and
            ``Imports_entries_hash_<i>`` to the hashed values.
        """
        from sklearn.feature_extraction import FeatureHasher

        # Group the imported symbols under the library they belong to.
        symbols_by_library = {}
        for library in lief_binary.imports:
            symbols = symbols_by_library.setdefault(library.name, [])
            for imported in library.entries:
                if imported.is_ordinal:
                    symbols.append("ordinal" + str(imported.ordinal))
                else:
                    # Entry names are capped at 10000 characters.
                    symbols.append(imported.name[:10000])

        # Library names alone: lower-cased, de-duplicated and sorted.
        library_tokens = sorted({name.lower() for name in symbols_by_library})
        library_row = FeatureHasher(256, input_type='string').transform([library_tokens]).toarray()[0]
        hashed = {f'Imports_libraries_hash_{idx}': value
                  for idx, value in enumerate(library_row)}

        # Fully qualified "library:entry" names, sorted.
        entry_tokens = sorted(
            name.lower() + ':' + symbol
            for name, symbols in symbols_by_library.items()
            for symbol in symbols
        )
        entry_row = FeatureHasher(1024, input_type='string').transform([entry_tokens]).toarray()[0]
        hashed.update({f'Imports_entries_hash_{idx}': value
                       for idx, value in enumerate(entry_row)})
        return hashed

Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License

6 votes

def test_hash_collisions():
    # Hash every character of one sample into a single bucket and inspect
    # the value stored in that bucket.
    tokens = list("Thequickbrownfoxjumped")
    n_tokens = len(tokens)

    def single_bucket(**options):
        hasher = FeatureHasher(n_features=1, input_type='string', **options)
        return hasher.fit_transform([tokens]).data[0]

    # check that some of the hashed tokens are added
    # with an opposite sign and cancel out
    assert abs(single_bucket(alternate_sign=True, non_negative=False)) < n_tokens

    assert abs(single_bucket(alternate_sign=True, non_negative=True)) < n_tokens

    # Without alternating signs every token adds to the bucket.
    assert single_bucket(alternate_sign=False, non_negative=True) == n_tokens

Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License

6 votes

def test_hasher_alternate_sign():
    # Check the sign of the hashed values for each combination of
    # alternate_sign and non_negative.
    samples = [list("Thequickbrownfoxjumped")]

    def hashed_values(alternate_sign, non_negative):
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               non_negative=non_negative,
                               input_type='string')
        return hasher.fit_transform(samples).data

    signed = hashed_values(True, False)
    assert signed.min() < 0 and signed.max() > 0

    signed_non_negative = hashed_values(True, True)
    assert signed_non_negative.min() > 0

    unsigned_non_negative = hashed_values(False, True)
    assert unsigned_non_negative.min() > 0
    unsigned = hashed_values(False, False)
    # With initially positive features, the non_negative option should
    # have no impact when alternate_sign=False
    assert_array_equal(unsigned_non_negative, unsigned)

Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_feature_hasher_strings():
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    rows = [["foo", "bar", "baz", "foo".encode("ascii")],
            ["bar".encode("ascii"), "baz", "quux"]]
    expected_row_sums = (4, 3)

    for exponent in (7, 9, 11, 16, 22):
        n_features = 2 ** exponent

        # Feed the rows through a one-shot iterator rather than the list.
        hasher = FeatureHasher(n_features, input_type="string",
                               alternate_sign=False)
        hashed = hasher.transform(iter(rows))

        assert_equal(hashed.shape[0], len(rows))
        assert_equal(hashed.shape[1], n_features)

        for row_index, expected_sum in enumerate(expected_row_sums):
            assert_equal(hashed[row_index].sum(), expected_sum)

        assert_equal(hashed.nnz, 6)

Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_feature_hasher_pairs_with_string_values():
    # Pair input whose values are a mix of numbers and strings.
    hasher = FeatureHasher(n_features=16, input_type="pair")

    mixed_samples = [{"foo": 1, "bar": "a"},
                     {"baz": "abc", "quux": 4, "foo": -1}]
    first, second = hasher.transform(
        iter(sample.items()) for sample in mixed_samples).toarray()
    assert_equal([1, 1], sorted(np.abs(first[first != 0])))
    assert_equal([1, 1, 4], sorted(np.abs(second[second != 0])))

    # Two identical samples must produce identical rows.
    repeated_samples = [{"bax": "abc"},
                        {"bax": "abc"}]
    first, second = hasher.transform(
        iter(sample.items()) for sample in repeated_samples).toarray()
    assert_equal([1], np.abs(first[first != 0]))
    assert_equal([1], np.abs(second[second != 0]))
    assert_array_equal(first, second)

Source File: run_model.py From kaggle_avazu_benchmark with Apache License 2.0

6 votes

def main(neg_rate, submission_num, n_iter, train_path):
    """Fit a hashed-feature SGD classifier and write the test predictions.

    :param neg_rate: passed to ``ml.PartialFitter`` and used to shift the
        fitted intercept by ``log(neg_rate)``.
    :param submission_num: number forwarded to ``pp.write_submission``.
    :param n_iter: iteration count forwarded to ``ml.PartialFitter``.
    :param train_path: location of the training data.
    """
    test_path = 'original_data/test'

    ids = list(pp.get_int_field('id', test_path))
    clicks = pp.get_int_field('click', train_path)

    # Row generators for the training and test data
    train = pp.data_generator(pp.clean_parse_row, train_path)
    test = pp.data_generator(pp.clean_parse_row, test_path)

    # Estimators: feature hashing followed by an SGD classifier
    hasher = FeatureHasher(n_features=2 ** 20, input_type='pair')
    classifier = SGDClassifier(loss='log', n_iter=1, alpha=.003, penalty='l2')

    # Fit both steps batch by batch
    pipeline = ml.PartialFitter(
        [hasher, classifier],
        batch_size=10000,
        logging=True,
        n_iter=n_iter,
        neg_rate=neg_rate,
    )
    pipeline.partial_fit(X=train, y=clicks)

    # Correct the intercept of the last step
    # NOTE(review): presumably compensates for negative down-sampling -- confirm.
    last_step = pipeline.steps[-1]
    last_step.intercept_[0] += np.log(neg_rate)

    preds = pipeline.predict_proba(newX=test)[:, 1]
    pp.write_submission(number=submission_num, ids=ids, preds=preds)

Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License

6 votes

def test_feature_hasher_pairs_with_string_values():
    # Pair input whose values are a mix of numbers and strings.
    def nonzero_magnitudes(row):
        return np.abs(row[row != 0])

    hasher = FeatureHasher(n_features=16, input_type="pair")

    mixed_samples = [{"foo": 1, "bar": "a"},
                     {"baz": u"abc", "quux": 4, "foo": -1}]
    row_a, row_b = hasher.transform(
        iter(sample.items()) for sample in mixed_samples).toarray()
    assert_equal([1, 1], sorted(nonzero_magnitudes(row_a)))
    assert_equal([1, 1, 4], sorted(nonzero_magnitudes(row_b)))

    # Two identical samples must produce identical rows.
    repeated_samples = [{"bax": "abc"},
                        {"bax": "abc"}]
    row_a, row_b = hasher.transform(
        iter(sample.items()) for sample in repeated_samples).toarray()
    assert_equal([1], nonzero_magnitudes(row_a))
    assert_equal([1], nonzero_magnitudes(row_b))
    assert_array_equal(row_a, row_b)

Source File: test_feature_extraction.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

6 votes

def test_objectmapper(self):
        """Check that the ModelFrame accessor exposes the same objects as ``fe``."""
        df = pdml.ModelFrame([])

        # Top-level estimators
        for name in ('DictVectorizer', 'FeatureHasher'):
            self.assertIs(getattr(df.feature_extraction, name), getattr(fe, name))

        # ``image`` submodule
        image_names = (
            'img_to_graph',
            'grid_to_graph',
            'extract_patches_2d',
            'reconstruct_from_patches_2d',
            'PatchExtractor',
        )
        for name in image_names:
            self.assertIs(getattr(df.feature_extraction.image, name),
                          getattr(fe.image, name))

        # ``text`` submodule
        text_names = (
            'CountVectorizer',
            'HashingVectorizer',
            'TfidfTransformer',
            'TfidfVectorizer',
        )
        for name in text_names:
            self.assertIs(getattr(df.feature_extraction.text, name),
                          getattr(fe.text, name))

Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License

6 votes

def test_feature_hasher_strings():
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    rows = [["foo", "bar", "baz", "foo".encode("ascii")],
            ["bar".encode("ascii"), "baz", "quux"]]
    expected_row_sums = (4, 3)

    for exponent in (7, 9, 11, 16, 22):
        n_features = 2 ** exponent

        # Feed the rows through a one-shot iterator rather than the list.
        hasher = FeatureHasher(n_features, non_negative=True,
                               input_type="string")
        hashed = hasher.transform(iter(rows))

        assert_equal(hashed.shape[0], len(rows))
        assert_equal(hashed.shape[1], n_features)

        assert_true(np.all(hashed.data > 0))
        for row_index, expected_sum in enumerate(expected_row_sums):
            assert_equal(hashed[row_index].sum(), expected_sum)

        assert_equal(hashed.nnz, 6)

Source File: pefeatures.py From youarespecial with MIT License

5 votes

def __call__(self, binary):
        """Return hashed library names and hashed import names as one flat vector.

        The result is 256 hashed library features followed by 1024 hashed
        ``library:function`` features, cast to ``self.dtype``.
        """
        library_names = [name.lower() for name in binary.libraries]

        # One string such as "kernel32.dll:CreateFileMappingA" per imported entry.
        qualified_names = []
        for library in binary.imports:
            for entry in library.entries:
                qualified_names.append(library.name.lower() + ':' + entry.name)

        # Two separate groups: libraries alone, then fully-qualified imports.
        library_block = FeatureHasher(
            256, input_type="string", dtype=self.dtype).transform([library_names]).toarray()
        import_block = FeatureHasher(
            1024, input_type="string", dtype=self.dtype).transform([qualified_names]).toarray()

        combined = np.concatenate([library_block, import_block], axis=-1)
        return combined.flatten().astype(self.dtype)

Source File: _machine_learning.py From qlik-py-tools with MIT License

5 votes

def hasher(df, col, n_features):
        """
        Hash the unique values in the specified column in the given dataframe, creating n_features

        :param df: dataframe holding the column to hash.
        :param col: name of the column; also used as the prefix of the hashed columns.
        :param n_features: number of hashed columns to create.
        :return: dataframe indexed by the unique values of ``col`` with one
            column per hashed feature, named ``<col>0`` ... ``<col>{n_features-1}``.
        """

        unique = pd.DataFrame(df[col].unique(), columns=[col])
        fh = FeatureHasher(n_features=n_features, input_type="string")
        # With input_type="string" every sample must be an iterable of tokens.
        # Passing the bare strings would hash each value character by
        # character (anagrams would collide) and is rejected by recent
        # scikit-learn releases, so wrap each value as a one-token sample.
        samples = [[value] for value in unique[col]]
        hashed = fh.fit_transform(samples)
        unique = unique.join(pd.DataFrame(hashed.toarray()).add_prefix(col))
        return unique.set_index(col)

Source File: pefeatures.py From youarespecial with MIT License

5 votes

def __call__(self, binary):
        """Return the exported function names hashed into a flat 128-wide vector."""
        export_hasher = FeatureHasher(128, input_type="string", dtype=self.dtype)
        # All exports of the binary form a single sample.
        hashed = export_hasher.transform([binary.exported_functions])
        return hashed.toarray().flatten().astype(self.dtype)

Source File: pefeatures.py From youarespecial with MIT License

5 votes

def __call__(self, binary):
        """Return header fields of the binary as one flat vector of ``self.dtype``.

        The vector holds the header timestamp, five 10-wide hashed groups
        (machine, characteristics, subsystem, DLL characteristics, magic) and
        eleven numeric optional-header fields, in that order.
        """
        header = binary.header
        optional = binary.optional_header

        def hash_tokens(tokens):
            # Hash one group of string tokens into a 1 x 10 dense array.
            hasher = FeatureHasher(10, input_type="string", dtype=self.dtype)
            return hasher.transform([tokens]).toarray()

        parts = [
            [[header.time_date_stamps]],
            hash_tokens([str(header.machine)]),
            hash_tokens([str(c) for c in header.characteristics_list]),
            hash_tokens([str(optional.subsystem)]),
            hash_tokens([str(c) for c in optional.dll_characteristics_lists]),
            hash_tokens([str(optional.magic)]),
        ]

        # Plain numeric fields, each appended as its own 1 x 1 column.
        numeric_fields = (
            "major_image_version",
            "minor_image_version",
            "major_linker_version",
            "minor_linker_version",
            "major_operating_system_version",
            "minor_operating_system_version",
            "major_subsystem_version",
            "minor_subsystem_version",
            "sizeof_code",
            "sizeof_headers",
            "sizeof_heap_commit",
        )
        for field in numeric_fields:
            parts.append([[getattr(optional, field)]])

        return np.concatenate(parts, axis=-1).flatten().astype(self.dtype)

Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License

5 votes

def test_feature_hasher_dicts():
    # The default input type is "dict", and dict input must hash exactly like
    # the equivalent (key, value) pair input.
    assert_equal("dict", FeatureHasher(n_features=16).input_type)

    samples = [{"foo": "bar", "dada": 42, "tzara": 37},
               {"foo": "baz", "gaga": u"string1"}]
    from_dicts = FeatureHasher(n_features=16).transform(samples)

    pair_hasher = FeatureHasher(n_features=16, input_type="pair")
    from_pairs = pair_hasher.transform(iter(sample.items()) for sample in samples)

    assert_array_equal(from_dicts.toarray(), from_pairs.toarray())

Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License

5 votes

def test_feature_hasher_pairs():
    # Pair input: the magnitudes of the hashed values must match the inputs.
    samples = [{"foo": 1, "bar": 2},
               {"baz": 3, "quux": 4, "foo": -1}]
    hasher = FeatureHasher(n_features=16, input_type="pair")

    first, second = hasher.transform(
        iter(sample.items()) for sample in samples).toarray()

    assert_equal([1, 2], sorted(np.abs(first[first != 0])))
    assert_equal([1, 3, 4], sorted(np.abs(second[second != 0])))

Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License

5 votes

def test_hash_empty_input():
    # Empty samples (list, tuple, exhausted iterator) must hash to all-zero rows.
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    # Use toarray() rather than the ``.A`` shorthand: recent SciPy releases
    # no longer provide ``.A`` on sparse matrices, toarray() works everywhere.
    assert_array_equal(X.toarray(), np.zeros((len(raw_X), n_features)))

Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License

5 votes

def test_hasher_set_params():
    # Test delayed input validation in fit (useful for grid search).
    # The invalid n_features is accepted by set_params and only rejected
    # once fit is called.
    invalid_hasher = FeatureHasher().set_params(n_features=np.inf)
    assert_raises(TypeError, invalid_hasher.fit)

Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License

5 votes

def test_hasher_zeros():
    # Assert that no zeros are materialized in the output.
    zero_valued_sample = {'foo': 0}
    hashed = FeatureHasher().transform([zero_valued_sample])
    assert_equal(hashed.data.shape, (0,))

Source File: test_feature_hasher.py From twitter-stock-recommendation with MIT License

5 votes

def test_hasher_negative():
    # Pair input with negative values: check the sign of the hashed data for
    # each combination of alternate_sign and non_negative.
    samples = [{"foo": 2, "bar": -4, "baz": -1}.items()]

    # (alternate_sign, non_negative, expect both signs in the output)
    cases = [
        (False, False, True),
        (False, True, False),
        (True, False, True),
        (True, True, False),
    ]
    for alternate_sign, non_negative, expect_both_signs in cases:
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               non_negative=non_negative,
                               input_type="pair")
        hashed = hasher.fit_transform(samples)
        if expect_both_signs:
            assert_true(hashed.data.min() < 0 and hashed.data.max() > 0)
        else:
            assert_true(hashed.data.min() > 0)

Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_hasher_alternate_sign():
    # Alternating signs must yield both positive and negative values;
    # without them all values stay positive.
    samples = [list("Thequickbrownfoxjumped")]

    def hashed_values(alternate_sign):
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               input_type='string')
        return hasher.fit_transform(samples).data

    signed = hashed_values(True)
    assert signed.min() < 0 and signed.max() > 0

    unsigned = hashed_values(False)
    assert unsigned.min() > 0

Source File: pefeatures.py From gym-malware with MIT License

5 votes

def __call__(self, binary):
        """Hash imported libraries and imported functions into one flat vector.

        Produces 256 hashed library features followed by 1024 hashed
        ``library:function`` features, cast to ``self.dtype``.
        """
        def hash_group(width, tokens):
            # Hash one group of tokens into a 1 x ``width`` dense array.
            hasher = FeatureHasher(width, input_type="string", dtype=self.dtype)
            return hasher.transform([tokens]).toarray()

        lowered_libraries = [name.lower() for name in binary.libraries]

        # we'll create a string like "kernel32.dll:CreateFileMappingA" for each entry
        qualified_imports = []
        for library in binary.imports:
            for entry in library.entries:
                qualified_imports.append(library.name.lower() + ':' + entry.name)

        # two separate elements: libraries (alone) and fully-qualified names of imported functions
        blocks = [
            hash_group(256, lowered_libraries),
            hash_group(1024, qualified_imports),
        ]
        return np.concatenate(blocks, axis=-1).flatten().astype(self.dtype)

Source File: pefeatures.py From gym-malware with MIT License

5 votes

def __call__(self, binary):
        """Hash the exported function names of ``binary`` into 128 features."""
        # The complete export list is hashed as a single sample.
        sample = [binary.exported_functions]
        dense = FeatureHasher(
            128, input_type="string", dtype=self.dtype).transform(sample).toarray()
        return dense.flatten().astype(self.dtype)

Source File: pefeatures.py From gym-malware with MIT License

5 votes

def __call__(self, binary):
        """Build a flat feature vector from the headers of ``binary``.

        Layout: header timestamp, then five 10-wide hashed groups (machine,
        characteristics, subsystem, DLL characteristics, magic), then eleven
        numeric optional-header fields. The result is cast to ``self.dtype``.
        """
        file_header = binary.header
        opt_header = binary.optional_header

        def hashed(tokens):
            # 1 x 10 dense hash of a list of string tokens.
            return FeatureHasher(
                10, input_type="string", dtype=self.dtype).transform([tokens]).toarray()

        def scalar(value):
            # Wrap a single number as a 1 x 1 column for concatenation.
            return [[value]]

        columns = [
            scalar(file_header.time_date_stamps),
            hashed([str(file_header.machine)]),
            hashed([str(flag) for flag in file_header.characteristics_list]),
            hashed([str(opt_header.subsystem)]),
            hashed([str(flag) for flag in opt_header.dll_characteristics_lists]),
            hashed([str(opt_header.magic)]),
            scalar(opt_header.major_image_version),
            scalar(opt_header.minor_image_version),
            scalar(opt_header.major_linker_version),
            scalar(opt_header.minor_linker_version),
            scalar(opt_header.major_operating_system_version),
            scalar(opt_header.minor_operating_system_version),
            scalar(opt_header.major_subsystem_version),
            scalar(opt_header.minor_subsystem_version),
            scalar(opt_header.sizeof_code),
            scalar(opt_header.sizeof_headers),
            scalar(opt_header.sizeof_heap_commit),
        ]
        return np.concatenate(columns, axis=-1).flatten().astype(self.dtype)

Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_feature_hasher_dicts():
    # The default input type is "dict", and dict input must hash exactly like
    # the equivalent (key, value) pair input.
    default_hasher = FeatureHasher(n_features=16)
    assert_equal("dict", default_hasher.input_type)

    samples = [{"foo": "bar", "dada": 42, "tzara": 37},
               {"foo": "baz", "gaga": "string1"}]
    dict_result = FeatureHasher(n_features=16).transform(samples)

    pairs = (iter(sample.items()) for sample in samples)
    pair_result = FeatureHasher(n_features=16, input_type="pair").transform(pairs)

    assert_array_equal(dict_result.toarray(), pair_result.toarray())

Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_feature_hasher_pairs():
    # Pair input: the magnitudes of the hashed values must match the inputs.
    def nonzero_magnitudes(row):
        return sorted(np.abs(row[row != 0]))

    samples = [{"foo": 1, "bar": 2},
               {"baz": 3, "quux": 4, "foo": -1}]
    pair_stream = (iter(sample.items()) for sample in samples)

    hasher = FeatureHasher(n_features=16, input_type="pair")
    row_a, row_b = hasher.transform(pair_stream).toarray()

    assert_equal([1, 2], nonzero_magnitudes(row_a))
    assert_equal([1, 3, 4], nonzero_magnitudes(row_b))

Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_hash_empty_input():
    # Empty samples (list, tuple, exhausted iterator) must hash to all-zero rows.
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    # Use toarray() rather than the ``.A`` shorthand: recent SciPy releases
    # no longer provide ``.A`` on sparse matrices, toarray() works everywhere.
    assert_array_equal(X.toarray(), np.zeros((len(raw_X), n_features)))

Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_hasher_set_params():
    # Test delayed input validation in fit (useful for grid search).
    # set_params accepts the invalid value; the TypeError comes from fit.
    deferred = FeatureHasher().set_params(n_features=np.inf)
    assert_raises(TypeError, deferred.fit)

Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_hasher_zeros():
    # Assert that no zeros are materialized in the output.
    samples = [{'foo': 0}]
    result = FeatureHasher().transform(samples)
    assert_equal(result.data.shape, (0,))

Source File: test_feature_hasher.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_hash_collisions():
    # Hash every character of one sample into a single bucket and inspect
    # the value stored in that bucket.
    tokens = list("Thequickbrownfoxjumped")
    n_tokens = len(tokens)

    def single_bucket(alternate_sign):
        hasher = FeatureHasher(alternate_sign=alternate_sign, n_features=1,
                               input_type='string')
        return hasher.fit_transform([tokens]).data[0]

    # check that some of the hashed tokens are added
    # with an opposite sign and cancel out
    assert abs(single_bucket(True)) < n_tokens

    # Without alternating signs every token adds to the bucket.
    assert single_bucket(False) == n_tokens

Source File: pe_exports_features.py From driverlessai-recipes with Apache License 2.0

5 votes

def exports_features(self, lief_binary):
        """Hash the exported function names of a parsed binary into 128 features.

        :param lief_binary: object exposing ``exported_functions``, a
            sortable collection of export names.
        :return: dict mapping ``Exports_functions_hash_<i>`` (``i`` in
            ``0..127``) to the hashed value; all values are ``0`` when the
            binary exports nothing.
        """
        from sklearn.feature_extraction import FeatureHasher

        n_buckets = 128
        exports = sorted(lief_binary.exported_functions)

        if not exports:
            return {f'Exports_functions_hash_{i}': 0 for i in range(n_buckets)}

        # The whole export list is ONE sample, so it must be wrapped in a list.
        # Passing ``exports`` directly would treat every name as its own
        # sample of characters, and ``[0]`` would then keep only the
        # character hash of the first name (recent scikit-learn rejects such
        # input outright).
        hashed_row = FeatureHasher(n_buckets, input_type='string').transform([exports]).toarray()[0]
        return {f'Exports_functions_hash_{i}': x for i, x in enumerate(hashed_row)}

Python sklearn.feature_extraction.FeatureHasher() Examples