Python Examples of sklearn.ensemble.IsolationForest

Source File: isoforest.py From Deep-SAD-PyTorch with MIT License

6 votes

def __init__(self, hybrid=False, n_estimators=100, max_samples='auto', contamination=0.1, n_jobs=-1, seed=None,
                 **kwargs):
        """Record the hyperparameters and build the wrapped IsolationForest.

        The named hyperparameters are stored on the instance and forwarded to
        ``IsolationForest`` (``seed`` is passed as ``random_state``); any extra
        ``kwargs`` are forwarded unchanged.
        """
        self.n_estimators, self.max_samples = n_estimators, max_samples
        self.contamination, self.n_jobs = contamination, n_jobs
        self.seed = seed

        forest_args = dict(
            n_estimators=n_estimators,
            max_samples=max_samples,
            contamination=contamination,
            n_jobs=n_jobs,
            random_state=seed,
        )
        self.model = IsolationForest(**forest_args, **kwargs)

        # In the hybrid case an autoencoder network is attached later.
        self.hybrid = hybrid
        self.ae_net = None

        # Result slots, all unset until training/testing fills them in.
        self.results = dict.fromkeys(('train_time', 'test_time', 'test_auc', 'test_scores'))

Source File: test_iforest.py From twitter-stock-recommendation with MIT License

6 votes

def test_iforest_error():
    """Check that invalid ``max_samples`` values are rejected when fitting."""
    X = iris.data

    # Out-of-range max_samples values must raise.
    for invalid in (-1, 0.0, 2.0):
        assert_raises(ValueError, IsolationForest(max_samples=invalid).fit, X)

    # An explicit max_samples larger than n_samples only warns; 'auto' and a
    # small explicit integer must stay silent.
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         IsolationForest(max_samples=1000).fit, X)
    for silent in ('auto', np.int64(2)):
        assert_no_warnings(IsolationForest(max_samples=silent).fit, X)

    # Unknown strings and floats above 1 must raise as well.
    for invalid in ('foobar', 1.5):
        assert_raises(ValueError, IsolationForest(max_samples=invalid).fit, X)

Source File: test_iforest.py From twitter-stock-recommendation with MIT License

6 votes

def test_iforest_sparse():
    """Sparse and dense inputs must yield identical predictions."""
    rng = check_random_state(0)
    X_train, X_test, _, _ = train_test_split(
        boston.data[:50], boston.target[:50], random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    def fit_predict(train, test, params):
        # Same seed for every fit so sparse and dense runs are comparable.
        forest = IsolationForest(n_estimators=10, random_state=1, **params)
        return forest.fit(train).predict(test)

    for to_sparse in (csc_matrix, csr_matrix):
        sparse_train, sparse_test = to_sparse(X_train), to_sparse(X_test)

        for params in grid:
            sparse_results = fit_predict(sparse_train, sparse_test, params)
            dense_results = fit_predict(X_train, X_test, params)
            assert_array_equal(sparse_results, dense_results)

Source File: iso_forest.py From safekit with MIT License

6 votes

def sample_hyps_iso_forest(nest, contam, boot):
    """
    Build an unfitted IsolationForest from the given hyperparameters.

    :param nest: Value passed as ``n_estimators``.
    :param contam: Value passed as ``contamination``.
    :param boot: Value passed as ``bootstrap``.
    :return: An IsolationForest object with specified hyperparameters, used to detect anomaly.
    """
    return IsolationForest(
        n_estimators=nest,
        max_samples='auto',
        contamination=contam,
        max_features=1.0,  # use all features
        bootstrap=boot,
        n_jobs=-1,  # use all cores
        verbose=0,
    )

Source File: test_iforest.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_deprecation():
    """Check the deprecation warnings emitted by fit and by ``threshold_``."""
    X = [[0.0], [1.0]]
    clf = IsolationForest()

    # Both FutureWarnings are raised by fitting a default-constructed model.
    future_messages = (
        'default contamination parameter 0.1 will change in version 0.22 to "auto"',
        'behaviour="old" is deprecated and will be removed in version 0.22',
    )
    for message in future_messages:
        assert_warns_message(FutureWarning, message, clf.fit, X)

    # Reading threshold_ on a fitted model is deprecated too.
    clf = IsolationForest().fit(X)
    threshold_message = ("threshold_ attribute is deprecated in 0.20 and will"
                         " be removed in 0.22.")
    assert_warns_message(DeprecationWarning, threshold_message,
                         getattr, clf, "threshold_")

Source File: test_iforest.py From twitter-stock-recommendation with MIT License

6 votes

def test_iforest_performance():
    """Test Isolation Forest performs well.

    Fits on 100 samples from one Gaussian blob, then scores 20 held-out
    samples from the same blob together with 20 uniformly drawn outliers and
    requires a ROC AUC above 0.98.
    """

    # Generate train/test data. The former `np.r_[X + 2, X - 2]` assignment
    # was dead code (immediately overwritten) and has been removed; it drew
    # nothing from `rng`, so the data is unchanged.
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # negate the decision function so that a higher score means more abnormal,
    # matching the 1 = outlier labels in y_test
    y_pred = - clf.decision_function(X_test)

    # outliers must be ranked above inliers almost perfectly
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)

Source File: test_iforest.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_iforest_sparse():
    """Sparse and dense inputs must yield identical predictions."""
    rng = check_random_state(0)
    X_train, X_test, _, _ = train_test_split(
        boston.data[:50], boston.target[:50], random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    def fit_predict(train, test, params):
        # Same seed for every fit so sparse and dense runs are comparable.
        forest = IsolationForest(n_estimators=10, random_state=1, **params)
        return forest.fit(train).predict(test)

    for to_sparse in (csc_matrix, csr_matrix):
        sparse_train, sparse_test = to_sparse(X_train), to_sparse(X_test)

        for params in grid:
            sparse_results = fit_predict(sparse_train, sparse_test, params)
            dense_results = fit_predict(X_train, X_test, params)
            assert_array_equal(sparse_results, dense_results)

Source File: outliers.py From visualqc with Apache License 2.0

6 votes

def run_isolation_forest(features, id_list, fraction_of_outliers=.3):
    """Performs anomaly detection based on Isolation Forest.

    Fits an IsolationForest on ``features`` (seeded with a fixed RandomState),
    scores the same samples, and returns the entries of ``id_list`` whose
    decision-function score lies below the ``fraction_of_outliers`` percentile
    of all scores.

    ``id_list`` may be any sequence aligned with the rows of ``features``; it
    is converted to an array so that boolean-mask selection also works for
    plain lists and tuples (an ndarray is used as is).
    """

    rng = np.random.RandomState(1984)

    num_samples = features.shape[0]
    iso_f = IsolationForest(max_samples=num_samples,
                            contamination=fraction_of_outliers,
                            random_state=rng)
    iso_f.fit(features)
    pred_scores = iso_f.decision_function(features)

    # Lower scores are more anomalous: keep everything below the percentile cut.
    threshold = stats.scoreatpercentile(pred_scores, 100 * fraction_of_outliers)
    outlying_ids = np.asarray(id_list)[pred_scores < threshold]

    return outlying_ids

Source File: test_iforest.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_iforest_performance():
    """Test Isolation Forest performs well.

    Fits on 100 samples from one Gaussian blob, then scores 20 held-out
    samples from the same blob together with 20 uniformly drawn outliers and
    requires a ROC AUC above 0.98.
    """

    # Generate train/test data. The former `np.r_[X + 2, X - 2]` assignment
    # was dead code (immediately overwritten) and has been removed; it drew
    # nothing from `rng`, so the data is unchanged.
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # negate the decision function so that a higher score means more abnormal,
    # matching the 1 = outlier labels in y_test
    y_pred = - clf.decision_function(X_test)

    # outliers must be ranked above inliers almost perfectly
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)

Source File: test_outlier_remover.py From scikit-lego with MIT License

5 votes

def test_pipeline_integration():
    """Smoke-test two chained OutlierRemover steps followed by KMeans."""
    np.random.seed(42)
    dataset = np.concatenate([np.random.normal(0, 1, (2000, 2))])

    # Build the steps in pipeline order: two outlier removers, then clustering.
    steps = [
        ("isolation_forest_remover",
         OutlierRemover(outlier_detector=IsolationForest())),
        ("gmm_remover",
         OutlierRemover(outlier_detector=GMMOutlierDetector())),
        ("kmeans", KMeans()),
    ]
    pipeline = Pipeline(steps)

    pipeline.fit(dataset)
    pipeline.transform(dataset)

Source File: random_split_trees.py From ad_examples with MIT License

5 votes

def fit(self, X, y=None, sample_weight=None):
        """Fit a fresh IsolationForest and expose its fitted trees on ``self``.

        The forest is configured from this object's own hyperparameter
        attributes. After fitting, ``estimators_`` and ``estimators_features_``
        are copied from the forest and ``updated`` is reset to False.
        """
        hyperparameter_names = (
            'n_estimators', 'max_samples', 'contamination', 'max_features',
            'bootstrap', 'n_jobs', 'random_state', 'verbose',
        )
        forest_kwargs = {name: getattr(self, name) for name in hyperparameter_names}

        self.ifor = IsolationForest(**forest_kwargs)
        self.ifor.fit(X, y, sample_weight)

        fitted = self.ifor
        self.estimators_ = fitted.estimators_
        self.estimators_features_ = fitted.estimators_features_
        self.updated = False

Source File: multiview_forest.py From ad_examples with MIT License

5 votes

def _multiview_fit(self, X, y, feature_partitions, n_estimators_view):
        """Fit one isolation forest per feature view and collect the trees.

        ``feature_partitions`` gives the number of consecutive columns of ``X``
        in each view and ``n_estimators_view`` the number of trees to build for
        that view. Returns a list with one entry per view, each a list of
        ``IForestMultiviewTree`` objects (empty when the view gets no trees).
        """
        n_features = X.shape[1]
        logger.debug("IForestMultiview n_estimators_view: %s" % str(list(n_estimators_view)))

        estimators_group = []
        feature_offset = 0
        for n_feats, n_est_ in zip(feature_partitions, n_estimators_view):
            view_end = feature_offset + n_feats
            X_ = X[:, feature_offset:view_end]

            estimators = []
            if n_est_ > 0:
                # Forest trained only on this view's columns.
                ifor_ = IsolationForest(n_estimators=n_est_,
                                        max_samples=self.max_samples,
                                        contamination=self.contamination,
                                        max_features=self.max_features,
                                        bootstrap=self.bootstrap,
                                        n_jobs=self.n_jobs,
                                        random_state=self.random_state,
                                        verbose=self.verbose)
                ifor_.fit(X_, y, sample_weight=None)

                for tree in ifor_.estimators_:
                    # The fitted trees expose read-only properties, so each one
                    # is copied into our own tree structure, which can then be
                    # modified.
                    mv_tree = IForestMultiviewTree(n_features=n_features, ifor_tree=tree.tree_)

                    # Shift the node feature indexes from view-local to global
                    # column positions.
                    mv_tree.tree_.feature += feature_offset

                    estimators.append(mv_tree)

            estimators_group.append(estimators)
            feature_offset = view_end

        return estimators_group

Source File: test_iso_gan.py From ad_examples with MIT License

5 votes

def get_iso_model(x, y, opts):
    """Fit an IsolationForest on ``x`` and return it with its scores.

    The forest uses 100 trees, 256 samples per tree, a contamination of 0.1
    and ``opts.randseed`` as the random state. ``y`` is not used. Returns the
    fitted model and the decision-function values of ``x`` as a column vector.
    """
    iso_model = IsolationForest(
        n_estimators=100,
        max_samples=256,
        contamination=0.1,
        random_state=opts.randseed,
    )
    iso_model.fit(x)

    scores = iso_model.decision_function(x)
    r = np.reshape(scores, (-1, 1))
    return iso_model, r

Source File: IsolationForest.py From mltk-algo-contrib with Apache License 2.0

5 votes

def __init__(self,options):
        """Validate the user-supplied parameters and build the estimator.

        ``options['params']`` is converted with ``convert_params`` and checked
        against the ranges below before being passed to ``_IsolationForest``.

        Raises:
            RuntimeError: if ``n_estimators`` is not positive, if
                ``max_samples`` or ``max_features`` is outside (0.0, 1.0], or
                if ``contamination`` is outside (0.0, 0.5].
        """
        self.handle_options(options)
        out_params = convert_params(
            options.get('params',{}),
            ints = ['n_estimators','n_jobs','random_state','verbose'],
            floats = ['max_samples','contamination','max_features'],
            bools = ['bootstrap']
            )
        # NOTE(review): 'anomaly_score' is not listed in any convert_params
        # group above -- confirm it can actually reach out_params.
        self.return_scores = out_params.pop('anomaly_score', True)

        # whitelist n_estimators > 0
        if 'n_estimators' in out_params and out_params['n_estimators']<=0:
            msg = 'Invalid value error: n_estimators must be greater than 0 and an integer, but found n_estimators="{}".'
            raise RuntimeError(msg.format(out_params['n_estimators']))

        # whitelist max_samples in (0.0, 1.0]. The previous test
        # (`< 0 and > 1`) could never be true, so bad values slipped through.
        if 'max_samples' in out_params and not (0.0 < out_params['max_samples'] <= 1.0):
            msg = 'Invalid value error: max_samples must be a float in (0.0, 1.0], but found max_samples="{}".'
            raise RuntimeError(msg.format(out_params['max_samples']))

        #   whitelist contamination should be in (0.0, 0.5] as error raised by sklearn for values out of range
        if 'contamination' in out_params and not (0.0 < out_params['contamination'] <= 0.5):
            msg = (
                'Invalid value error: Valid values for contamination are in (0.0, 0.5], '
                'but found contamination="{}".'
            )
            raise RuntimeError(msg.format(out_params['contamination']))

        # whitelist max_features in (0.0, 1.0]; same dead-condition fix as
        # for max_samples.
        if 'max_features' in out_params and not (0.0 < out_params['max_features'] <= 1.0):
            msg = 'Invalid value error: max_features must be a float in (0.0, 1.0], but found max_features="{}".'
            raise RuntimeError(msg.format(out_params['max_features']))

        self.estimator = _IsolationForest(**out_params)

Source File: test_outlier_remover.py From scikit-lego with MIT License

5 votes

def test_estimator_checks(test_fn):
    """Run the supplied estimator check on OutlierRemover with each detector."""
    for detector_cls in (GMMOutlierDetector, IsolationForest):
        remover = OutlierRemover(outlier_detector=detector_cls(), refit=True)
        test_fn(OutlierRemover.__name__, remover)

Source File: model.py From batea with GNU General Public License v2.0

5 votes

def build_model(self, outlier_ratio=0.1, n_estimators=100, max_samples='auto'):
        """Create the IsolationForest and store it on ``self.model``.

        ``outlier_ratio`` is passed to the forest as ``contamination``.
        """
        # NOTE(review): the `behaviour` argument only exists in older
        # scikit-learn releases -- confirm the pinned version accepts it.
        forest_kwargs = dict(
            contamination=outlier_ratio,
            n_estimators=n_estimators,
            max_samples=max_samples,
            behaviour='new',
        )
        self.model = IsolationForest(**forest_kwargs)

Source File: test_outlier_remover.py From scikit-lego with MIT License

5 votes

def test_estimator_checks(test_fn):
    """Run the supplied estimator check on OutlierRemover with each detector."""
    for detector_cls in (GMMOutlierDetector, IsolationForest):
        remover = OutlierRemover(outlier_detector=detector_cls(), refit=True)
        test_fn(OutlierRemover.__name__, remover)

Source File: test_outlier_remover.py From scikit-lego with MIT License

5 votes

def test_pipeline_integration():
    """Smoke-test two chained OutlierRemover steps followed by KMeans."""
    np.random.seed(42)
    dataset = np.concatenate([np.random.normal(0, 1, (2000, 2))])

    # Build the steps in pipeline order: two outlier removers, then clustering.
    steps = [
        ("isolation_forest_remover",
         OutlierRemover(outlier_detector=IsolationForest())),
        ("gmm_remover",
         OutlierRemover(outlier_detector=GMMOutlierDetector())),
        ("kmeans", KMeans()),
    ]
    pipeline = Pipeline(steps)

    pipeline.fit(dataset)
    pipeline.transform(dataset)

Source File: test_iforest.py From twitter-stock-recommendation with MIT License

5 votes

def test_max_samples_attribute():
    """Check the fitted ``max_samples_`` for default, oversized and float input."""
    X = iris.data
    n_samples = X.shape[0]

    # Default setting: max_samples_ equals the number of samples here.
    default_forest = IsolationForest().fit(X)
    assert_equal(default_forest.max_samples_, n_samples)

    # An explicit value above n_samples warns and is clipped to n_samples.
    oversized_forest = IsolationForest(max_samples=500)
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         oversized_forest.fit, X)
    assert_equal(oversized_forest.max_samples_, n_samples)

    # A float is interpreted as a fraction of n_samples.
    fractional_forest = IsolationForest(max_samples=0.4).fit(X)
    assert_equal(fractional_forest.max_samples_, 0.4 * n_samples)

Source File: isoForest.py From Deep-SVDD with MIT License

5 votes

def initialize_isoForest(self, seed=0, **kwargs):
        """Create the IsolationForest and store it on ``self.isoForest``.

        Uses this object's ``n_estimators``, ``max_samples`` and
        ``contamination``, all available cores (``n_jobs=-1``) and ``seed`` as
        the random state; extra ``kwargs`` are forwarded unchanged.
        """
        forest_args = dict(
            n_estimators=self.n_estimators,
            max_samples=self.max_samples,
            contamination=self.contamination,
            n_jobs=-1,
            random_state=seed,
        )
        self.isoForest = IsolationForest(**forest_args, **kwargs)

Source File: test_ensemble.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

5 votes

def test_objectmapper(self):
        """Each ``df.ensemble`` attribute must be the matching ensemble class."""
        df = pdml.ModelFrame([])

        estimator_names = (
            'AdaBoostClassifier',
            'AdaBoostRegressor',
            'BaggingClassifier',
            'BaggingRegressor',
            'ExtraTreesClassifier',
            'ExtraTreesRegressor',
            'GradientBoostingClassifier',
            'GradientBoostingRegressor',
            'IsolationForest',
            'RandomForestClassifier',
            'RandomTreesEmbedding',
            'RandomForestRegressor',
            'VotingClassifier',
        )
        for name in estimator_names:
            self.assertIs(getattr(df.ensemble, name), getattr(ensemble, name))

Source File: test_iforest.py From twitter-stock-recommendation with MIT License

5 votes

def test_iforest():
    """Smoke-test fit and predict over a small hyperparameter grid."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])

    search_space = {
        "n_estimators": [3],
        "max_samples": [0.5, 1.0, 3],
        "bootstrap": [True, False],
    }
    grid = ParameterGrid(search_space)

    # Warnings are suppressed; only fit/predict completing matters here.
    with ignore_warnings():
        for params in grid:
            forest = IsolationForest(random_state=rng, **params)
            forest.fit(X_train).predict(X_test)

Source File: test_iforest.py From twitter-stock-recommendation with MIT License

5 votes

def test_recalculate_max_depth():
    """Every fitted tree's max_depth must equal ceil(log2(n_samples))."""
    X = iris.data
    expected_depth = int(np.ceil(np.log2(X.shape[0])))

    forest = IsolationForest().fit(X)
    for tree in forest.estimators_:
        assert_equal(tree.max_depth, expected_depth)

Source File: test_iforest.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_iforest_error():
    """Check errors and warnings raised for deficient input."""
    X = iris.data

    # Out-of-range max_samples values must raise.
    for invalid in (-1, 0.0, 2.0):
        assert_raises(ValueError, IsolationForest(max_samples=invalid).fit, X)

    # An explicit max_samples larger than n_samples only warns; 'auto' and a
    # small explicit integer must not produce a UserWarning.
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         IsolationForest(max_samples=1000).fit, X)

    # assert_no_warnings is not used here because it enables a
    # PendingDeprecationWarning triggered by scipy.sparse's use of
    # np.matrix (see issue #11251); only UserWarnings are counted instead.
    for silent in ('auto', np.int64(2)):
        with pytest.warns(None) as record:
            IsolationForest(max_samples=silent).fit(X)
        user_warnings = [caught for caught in record
                         if issubclass(caught.category, UserWarning)]
        assert not user_warnings

    # Unknown strings and floats above 1 must raise as well.
    for invalid in ('foobar', 1.5):
        assert_raises(ValueError, IsolationForest(max_samples=invalid).fit, X)

    # Predicting with fewer features than were used for fitting must raise.
    fitted_predict = IsolationForest().fit(X).predict
    assert_raises(ValueError, fitted_predict, X[:, 1:])

    # threshold_ must not be available when behaviour is not 'old'.
    msg = "threshold_ attribute does not exist when behaviour != 'old'"
    new_behaviour_forest = IsolationForest(behaviour='new')
    assert_raises_regex(AttributeError, msg, getattr,
                        new_behaviour_forest, 'threshold_')

Source File: test_iforest.py From twitter-stock-recommendation with MIT License

5 votes

def test_iforest_subsampled_features():
    """Non-regression test for #5732: predict failed with subsampled features."""
    rng = check_random_state(0)
    features, targets = boston.data[:50], boston.target[:50]
    X_train, X_test, y_train, _ = train_test_split(features, targets,
                                                   random_state=rng)

    forest = IsolationForest(max_features=0.8)
    forest.fit(X_train, y_train)
    forest.predict(X_test)

Source File: test_iforest.py From twitter-stock-recommendation with MIT License

5 votes

def test_max_samples_consistency():
    """The public ``max_samples_`` must equal the private ``_max_samples``."""
    forest = IsolationForest().fit(iris.data)
    public_value = forest.max_samples_
    assert_equal(public_value, forest._max_samples)

Source File: test_iforest.py From twitter-stock-recommendation with MIT License

5 votes

def test_iforest_works():
    """The two outliers of a toy sample must be scored and labelled as such."""
    # Toy sample: six inliers followed by two outliers.
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Fit the isolation forest, expecting a quarter of the samples as outliers.
    forest = IsolationForest(random_state=rng, contamination=0.25)
    forest.fit(X)
    abnormality = - forest.decision_function(X)
    pred = forest.predict(X)

    # Both outliers must score as more abnormal than every inlier ...
    inlier_scores, outlier_scores = abnormality[:-2], abnormality[-2:]
    assert_greater(np.min(outlier_scores), np.max(inlier_scores))
    # ... and be the only samples labelled -1.
    assert_array_equal(pred, [1] * 6 + [-1] * 2)

Source File: isolation_forest.py From monasca-analytics with Apache License 2.0

5 votes

def __init__(self, _id, _config):
        """Initialise the base class and read ``nb_samples`` from the config."""
        super(IsolationForest, self).__init__(_id, _config)
        # Stored as an int whatever type the config value has.
        nb_samples = _config['nb_samples']
        self._nb_samples = int(nb_samples)

Source File: isolation_forest.py From monasca-analytics with Apache License 2.0

5 votes

def get_default_config():
        """Return the default configuration dict for this module."""
        default_config = dict(module=IsolationForest.__name__,
                              nb_samples=N_SAMPLES)
        return default_config

Source File: isolation_forest.py From monasca-analytics with Apache License 2.0

5 votes

def _get_best_detector(self, train):
        """Fit a default-configured IsolationForest on ``train`` and return it."""
        forest = ensemble.IsolationForest()
        forest.fit(train)
        return forest

Python sklearn.ensemble.IsolationForest() Examples