Python sklearn.base.BaseEstimator() Examples
The following are 30 code examples of sklearn.base.BaseEstimator(), collected from open-source projects. The originating project, source file, and license are noted above each example.
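Before the examples, here is a minimal sketch (illustrative only, not from any project below) of what subclassing BaseEstimator provides: get_params() and set_params() come for free, as long as __init__ stores its keyword arguments unmodified.

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

class MeanThresholdClassifier(BaseEstimator, ClassifierMixin):
    """Hypothetical toy classifier: predicts 1 when a row's mean exceeds a threshold."""

    def __init__(self, threshold=0.5):
        # store constructor args unmodified so get_params()/set_params() work
        self.threshold = threshold

    def fit(self, X, y=None):
        self.classes_ = np.array([0, 1])
        return self

    def predict(self, X):
        return (np.asarray(X).mean(axis=1) > self.threshold).astype(int)

clf = MeanThresholdClassifier(threshold=0.2)
print(clf.get_params())   # {'threshold': 0.2} -- inherited from BaseEstimator
clf.set_params(threshold=0.8)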
Example #1
Source File: test_builder.py From gordo with GNU Affero General Public License v3.0

def test_get_metadata_helper(model: BaseEstimator, expect_empty_dict: bool):
    """
    Ensure the builder works with various model configs and that each has
    expected/valid metadata results.
    """
    X, y = np.random.random((1000, 4)), np.random.random((1000,))

    model.fit(X, y)

    metadata = ModelBuilder._extract_metadata_from_model(model)

    # All the metadata we've implemented so far is 'history', so we'll check that
    if not expect_empty_dict:
        assert "history" in metadata
        assert all(
            name in metadata["history"] for name in ("params", "loss", "accuracy")
        )
    else:
        assert dict() == metadata
Example #2
Source File: mis_classifier.py From autoimpute with MIT License

def __init__(self, classifier=None, predictors="all"):
    """Create an instance of the MissingnessClassifier.

    The MissingnessClassifier inherits from sklearn BaseEstimator and
    ClassifierMixin. This inheritance and this class' implementation
    ensure that the MissingnessClassifier is a valid classifier that
    will work in an sklearn pipeline.

    Args:
        classifier (classifier, optional): valid classifier from sklearn.
            If None, default is xgboost. Note that classifier must
            conform to sklearn style. This means it must implement the
            `predict_proba` method and act as a proper classifier.
        predictors (str, iter, dict, optional): defaults to all, i.e.
            use all predictors. If all, every column will be used for
            every class prediction. If a list, subset of columns used
            for all predictions. If a dict, specify which columns to
            use as predictors for each imputation. Columns not
            specified in dict will receive `all` by default.
    """
    self.classifier = classifier
    self.predictors = predictors
Example #3
Source File: uncertainty.py From modAL with MIT License

def classifier_uncertainty(classifier: BaseEstimator, X: modALinput,
                           **predict_proba_kwargs) -> np.ndarray:
    """
    Classification uncertainty of the classifier for the provided samples.

    Args:
        classifier: The classifier for which the uncertainty is to be measured.
        X: The samples for which the uncertainty of classification is to be measured.
        **predict_proba_kwargs: Keyword arguments to be passed for the
            :meth:`predict_proba` of the classifier.

    Returns:
        Classifier uncertainty, which is 1 - P(prediction is correct).
    """
    # calculate uncertainty for each point provided
    try:
        classwise_uncertainty = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.ones(shape=(X.shape[0], ))

    # for each point, select the maximum uncertainty
    uncertainty = 1 - np.max(classwise_uncertainty, axis=1)
    return uncertainty
Example #4
Source File: base.py From modAL with MIT License

def __init__(self,
             estimator: BaseEstimator,
             query_strategy: Callable,
             X_training: Optional[modALinput] = None,
             y_training: Optional[modALinput] = None,
             bootstrap_init: bool = False,
             force_all_finite: bool = True,
             **fit_kwargs
             ) -> None:
    assert callable(query_strategy), 'query_strategy must be callable'

    self.estimator = estimator
    self.query_strategy = query_strategy

    self.X_training = X_training
    self.y_training = y_training
    if X_training is not None:
        self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs)

    assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool'
    self.force_all_finite = force_all_finite
Example #5
Source File: validation.py From modAL with MIT License

def check_class_labels(*args: BaseEstimator) -> bool:
    """
    Checks the known class labels for each classifier.

    Args:
        *args: Classifier objects to check the known class labels.

    Returns:
        True, if class labels match for all classifiers, False otherwise.
    """
    try:
        classes_ = [estimator.classes_ for estimator in args]
    except AttributeError:
        raise NotFittedError('Not all estimators are fitted. Fit all estimators before using this method.')

    for classifier_idx in range(len(args) - 1):
        if not np.array_equal(classes_[classifier_idx], classes_[classifier_idx + 1]):
            return False

    return True
Example #6
Source File: test_weight_boosting.py From Mastering-Elasticsearch-7.0 with MIT License

def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor should work without sample_weights in the base estimator.
    The random weighted sampling is done internally in the _boost method in
    AdaBoostRegressor.
    """
    class DummyEstimator(BaseEstimator):

        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])

    boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
    boost.fit(X, y_regr)

    assert_equal(len(boost.estimator_weights_), len(boost.estimator_errors_))
Example #7
Source File: base.py From Neuraxle with Apache License 2.0

def tosklearn(self):
    class NeuraxleToSKLearnPipelineWrapper(BaseEstimator):
        def __init__(self, neuraxle_step):
            self.p: Union[BaseStep, TruncableSteps] = neuraxle_step

        def set_params(self, **params) -> BaseEstimator:
            self.p.set_hyperparams(HyperparameterSpace(params))
            return self

        def get_params(self, deep=True):
            neuraxle_params = HyperparameterSamples(self.p.get_hyperparams()).to_flat_as_dict_primitive()
            return neuraxle_params

        def get_params_space(self, deep=True):
            neuraxle_params = HyperparameterSpace(self.p.get_hyperparams_space()).to_flat_as_dict_primitive()
            return neuraxle_params

        def fit(self, **args) -> BaseEstimator:
            self.p = self.p.fit(**args)
            # return self so the wrapper behaves like an sklearn estimator
            return self

        def transform(self, **args):
            return self.p.transform(**args)

    return NeuraxleToSKLearnPipelineWrapper(self)
Example #8
Source File: run.py From nyaggle with MIT License

def _dispatch_models(algorithm_type: Union[str, Type[BaseEstimator]],
                     target_type: str, custom_eval: Optional[Callable] = None):
    if not isinstance(algorithm_type, str):
        assert issubclass(algorithm_type, BaseEstimator), \
            "algorithm_type should be str or subclass of BaseEstimator"
        return algorithm_type, _dispatch_eval_func(target_type, custom_eval), None

    cat_features = {
        'lgbm': 'categorical_feature',
        'cat': 'cat_features',
        'xgb': None
    }

    gbdt_class = _dispatch_gbdt_class(algorithm_type, target_type)
    eval_func = _dispatch_eval_func(target_type, custom_eval)

    return gbdt_class, eval_func, cat_features[algorithm_type]
Example #9
Source File: model.py From gobbli with Apache License 2.0

def persist_estimator(estimator: BaseEstimator) -> Path:
    """
    Saves the given estimator to a gobbli-managed filepath, where it can be
    loaded from disk by the SKLearnClassifier.  This is useful if you want to
    use an estimator but don't want to bother with saving it to disk on your
    own.

    Args:
        estimator: The estimator to persist.

    Returns:
        The path where the estimator was saved.
    """
    estimator_dir = (
        SKLearnClassifier.model_class_dir() / "user_estimators" / generate_uuid()
    )
    estimator_dir.mkdir(exist_ok=True, parents=True)

    estimator_path = estimator_dir / SKLearnClassifier._TRAIN_OUTPUT_CHECKPOINT
    SKLearnClassifier._dump_estimator(estimator, estimator_path)

    return estimator_path
Example #10
Source File: combination.py From modAL with MIT License

def make_query_strategy(utility_measure: Callable, selector: Callable) -> Callable:
    """
    Takes the given utility measure and selector functions and makes a query
    strategy by combining them.

    Args:
        utility_measure: Utility measure, for instance
            :func:`~modAL.disagreement.vote_entropy`, but it can be a custom
            function as well. Should take a classifier and the unlabelled data
            and should return an array containing the utility scores.
        selector: Function selecting instances for query. Should take an array
            of utility scores and should return an array containing the queried
            items.

    Returns:
        A function which returns queried instances given a classifier and an
        unlabelled pool.
    """
    def query_strategy(classifier: BaseEstimator, X: modALinput) -> Tuple:
        utility = utility_measure(classifier, X)
        query_idx = selector(utility)
        return query_idx, X[query_idx]

    return query_strategy
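A usage sketch, assuming modAL's public import paths for its built-in classifier_uncertainty measure (Example #3 above) and multi_argmax selector; combining them reproduces plain uncertainty sampling:

from modAL.uncertainty import classifier_uncertainty
from modAL.utils.selection import multi_argmax
from modAL.utils.combination import make_query_strategy

# score the whole pool with the utility measure, then pick the argmax
custom_strategy = make_query_strategy(
    utility_measure=classifier_uncertainty,
    selector=multi_argmax,
)
# query_idx, query_instances = custom_strategy(fitted_classifier, X_pool)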
Example #11
Source File: test_calibration.py From Mastering-Elasticsearch-7.0 with MIT License

def test_calibration_accepts_ndarray(X):
    """Test that calibration accepts n-dimensional arrays as input"""
    y = [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0]

    class MockTensorClassifier(BaseEstimator):
        """A toy estimator that accepts tensor inputs"""

        def fit(self, X, y):
            self.classes_ = np.unique(y)
            return self

        def decision_function(self, X):
            # toy decision function that just needs to have the right shape:
            return X.reshape(X.shape[0], -1).sum(axis=1)

    calibrated_clf = CalibratedClassifierCV(MockTensorClassifier())
    # we should be able to fit this classifier with no error
    calibrated_clf.fit(X, y)
Example #12
Source File: build_model.py From gordo with GNU Affero General Public License v3.0

def _determine_offset(
    model: BaseEstimator, X: Union[np.ndarray, pd.DataFrame]
) -> int:
    """
    Determine the model's offset. How much does the output of the model
    differ from its input?

    Parameters
    ----------
    model: sklearn.base.BaseEstimator
        Trained model with either ``predict`` or ``transform`` method,
        preference given to ``predict``.
    X: Union[np.ndarray, pd.DataFrame]
        Data to pass to the model's ``predict`` or ``transform`` method.

    Returns
    -------
    int
        The difference between X and the model's output lengths.
    """
    out = model.predict(X) if hasattr(model, "predict") else model.transform(X)
    return len(X) - len(out)
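For intuition, a sketch with a hypothetical lag-window model (illustrative only, not gordo code): a model that consumes a 10-step window produces 9 fewer output rows than input rows, so the offset is 9.

import numpy as np

class LagModel:
    """Hypothetical model whose output is 9 rows shorter than its input."""
    def predict(self, X):
        return X[9:]

offset = _determine_offset(LagModel(), np.zeros((1000, 4)))
print(offset)  # 9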
Example #13
Source File: uncertainty.py From modAL with MIT License

def classifier_entropy(classifier: BaseEstimator, X: modALinput,
                       **predict_proba_kwargs) -> np.ndarray:
    """
    Entropy of the classifier's predictions for the provided samples.

    Args:
        classifier: The classifier for which the prediction entropy is to be measured.
        X: The samples for which the prediction entropy is to be measured.
        **predict_proba_kwargs: Keyword arguments to be passed for the
            :meth:`predict_proba` of the classifier.

    Returns:
        Entropy of the class probabilities.
    """
    try:
        classwise_uncertainty = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.zeros(shape=(X.shape[0], ))

    return np.transpose(entropy(np.transpose(classwise_uncertainty)))
Example #14
Source File: utils.py From gordo with GNU Affero General Public License v3.0

def load_model(directory: str, name: str) -> BaseEstimator:
    """
    Load a given model from the directory by name.

    Parameters
    ----------
    directory: str
        Directory to look for the model
    name: str
        Name of the model to load, this would be the sub directory within the
        directory parameter.

    Returns
    -------
    BaseEstimator
    """
    start_time = timeit.default_timer()
    model = serializer.load(os.path.join(directory, name))
    logger.debug(f"Time to load model: {timeit.default_timer() - start_time}s")
    return model
Example #15
Source File: model.py From gobbli with Apache License 2.0

def _validate_estimator(estimator: BaseEstimator):
    """
    Run some checks on the given object to determine if it's an estimator which
    is valid for our purposes.
    """
    # sklearn has a function that does a lot more intensive checking regarding
    # the interface of a candidate Estimator
    # (sklearn.utils.estimator_checks.check_estimator), but the function
    # doesn't work well for our use case as of version 0.22.  It doesn't
    # properly detect Pipeline X_types based on the first pipeline component
    # and won't test anything that doesn't accept a 2-D numpy array as input.
    # We'll settle for lax checks here until sklearn has something that works
    # better for us.
    if not is_classifier(estimator):
        raise ValueError(
            "Estimator must be a classifier according to sklearn.base.is_classifier()"
        )
    if not hasattr(estimator, "predict_proba"):
        raise ValueError(
            "Estimator must support the predict_proba() method to fulfill gobbli's "
            "interface requirements for a prediction model."
        )
Example #16
Source File: uncertainty.py From modAL with MIT License

def classifier_margin(classifier: BaseEstimator, X: modALinput,
                      **predict_proba_kwargs) -> np.ndarray:
    """
    Classification margin uncertainty of the classifier for the provided samples.
    This uncertainty measure takes the first and second most likely predictions
    and takes the difference of their probabilities, which is the margin.

    Args:
        classifier: The classifier for which the prediction margin is to be measured.
        X: The samples for which the prediction margin of classification is to be measured.
        **predict_proba_kwargs: Keyword arguments to be passed for the
            :meth:`predict_proba` of the classifier.

    Returns:
        Margin uncertainty, which is the difference of the probabilities of first
        and second most likely predictions.
    """
    try:
        classwise_uncertainty = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.zeros(shape=(X.shape[0], ))

    if classwise_uncertainty.shape[1] == 1:
        return np.zeros(shape=(classwise_uncertainty.shape[0],))

    # np.partition(-p, 1) places the two largest probabilities (negated) in the
    # first two positions of each row; their difference is the margin
    part = np.partition(-classwise_uncertainty, 1, axis=1)
    margin = - part[:, 0] + part[:, 1]

    return margin
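A quick numeric check of the partition trick above (an illustrative sketch, not modAL code):

import numpy as np

proba = np.array([[0.1, 0.7, 0.2],
                  [0.4, 0.4, 0.2]])
part = np.partition(-proba, 1, axis=1)
# part[:, 0] holds the negated largest probability, part[:, 1] the negated runner-up
margin = -part[:, 0] + part[:, 1]
print(margin)  # [0.5 0. ] -- per-row difference between the top two probabilities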
Example #17
Source File: __init__.py From sklearn2pmml with GNU Affero General Public License v3.0

def make_pmml_pipeline(obj, active_fields = None, target_fields = None):
    """Translates a regular Scikit-Learn estimator or pipeline to a PMML pipeline.

    Parameters:
    ----------
    obj: BaseEstimator
        The object.

    active_fields: list of strings, optional
        Feature names. If missing, "x1", "x2", ..., "xn" are assumed.

    target_fields: list of strings, optional
        Label name(s). If missing, "y" is assumed.
    """
    steps = _filter_steps(_get_steps(obj))
    pipeline = PMMLPipeline(steps)
    if active_fields is not None:
        pipeline.active_fields = numpy.asarray(active_fields)
    if target_fields is not None:
        pipeline.target_fields = numpy.asarray(target_fields)
    return pipeline
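A usage sketch with hypothetical column names, assuming scikit-learn's DecisionTreeClassifier:

from sklearn.tree import DecisionTreeClassifier

# wraps a bare estimator as a one-step PMMLPipeline with named fields
pipeline = make_pmml_pipeline(
    DecisionTreeClassifier(),
    active_fields=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    target_fields=["species"],
)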
Example #18
Source File: cli.py From skorch with BSD 3-Clause "New" or "Revised" License

def print_help(model, defaults=None):
    """Print help for the command line arguments of the given model.

    Parameters
    ----------
    model : sklearn.base.BaseEstimator
        The basic model, e.g. a ``NeuralNet`` or sklearn ``Pipeline``.

    defaults : dict or None (default=None)
        Optionally, change the default values to use custom defaults.
        Command line arguments have precedence over defaults.
    """
    defaults = defaults or {}

    print("This is the help for the model-specific parameters.")
    print("To invoke help for the remaining options, run:")
    print("python {} -- --help".format(sys.argv[0]))
    print()

    lines = (_get_help_for_estimator(prefix, estimator, defaults=defaults)
             for prefix, estimator in _yield_estimators(model))
    print('\n'.join(chain(*lines)))
Example #19
Source File: _normalize.py From dask-ml with BSD 3-Clause "New" or "Revised" License

def normalize_estimator(est):
    """Normalize an estimator.

    Note: Since scikit-learn requires duck-typing, but not sub-typing from
    ``BaseEstimator``, we sometimes need to call this function directly."""
    base = [type(est).__name__, normalize_token(est.get_params())]
    # fitted attributes: https://github.com/dask/dask-ml/issues/658
    attrs = [x for x in dir(est) if x.endswith("_") and not x.startswith("_")]
    exclude = {"cv_results_", "model_history_", "history_", "refit_time_"}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        for attr in attrs:
            if attr in exclude:
                continue
            try:
                val = getattr(est, attr)
            except (sklearn.exceptions.NotFittedError, AttributeError):
                continue
            base.append(val)
    return tuple(base)
Example #20
Source File: test_sklearn_model_io.py From kryptoflow with GNU General Public License v3.0

def test_trainable_model_from_file(sklearn_model, project_manager):
    skl = SklearnModel(artifact=sklearn_model)
    # lr = LogisticRegression()
    # trainable = TrainableModel(artifact=lr)
    skl.store(name='clf')
    trainable = TrainableModel.from_file(run_number=1, name='clf', model_type='sklearn')
    assert isinstance(trainable.model, BaseEstimator)

    for root, dirs, files in os.walk(project_manager.CONFIG['saved-models']):
        for f in files:
            os.unlink(os.path.join(root, f))
        for d in dirs:
            shutil.rmtree(os.path.join(root, d))

    with open(os.path.join(project_manager.CONFIG['saved-models'], '.gitkeep'), 'w') as gitkeep:
        gitkeep.write('empty')
Example #21
Source File: test_sklearn_model_io.py From kryptoflow with GNU General Public License v3.0

def test_loader(sklearn_model, project_manager):
    skl = SklearnModel(artifact=sklearn_model)
    skl.store(name='clf')
    reloaded = skl.load(name='clf')
    assert isinstance(reloaded, BaseEstimator)

    skl2 = SklearnModel(artifact=sklearn_model)
    skl2.store(name='clf')
    reload_first = skl.load(run_number=1, name='clf')
    assert isinstance(reload_first, BaseEstimator)

    for root, dirs, files in os.walk(project_manager.CONFIG['saved-models']):
        for f in files:
            os.unlink(os.path.join(root, f))
        for d in dirs:
            shutil.rmtree(os.path.join(root, d))

    with open(os.path.join(project_manager.CONFIG['saved-models'], '.gitkeep'), 'w') as gitkeep:
        gitkeep.write('empty')
Example #22
Source File: __init__.py From sklearn2pmml with GNU Affero General Public License v3.0

def verify(self, X, predict_params = {}, predict_proba_params = {}, precision = 1e-13, zeroThreshold = 1e-13):
    active_fields = _get_column_names(X)
    if self.active_fields is None or active_fields is None:
        raise ValueError("Cannot perform model validation with anonymous data")
    if self.active_fields.tolist() != active_fields.tolist():
        raise ValueError("The columns between training data {} and verification data {} do not match".format(self.active_fields, active_fields))
    active_values = _get_values(X)
    y = self.predict(X, **predict_params)
    target_values = _get_values(y)
    estimator = self._final_estimator
    if isinstance(estimator, BaseEstimator):
        if isinstance(estimator, RegressorMixin):
            self.verification = _Verification(active_values, target_values, precision, zeroThreshold)
        elif isinstance(estimator, ClassifierMixin):
            self.verification = _Verification(active_values, target_values, precision, zeroThreshold)
            if hasattr(estimator, "predict_proba"):
                try:
                    y_proba = self.predict_proba(X, **predict_proba_params)
                    self.verification.probability_values = _get_values(y_proba)
                except AttributeError:
                    pass
    # elif isinstance(estimator, H2OEstimator):
    elif hasattr(estimator, "_estimator_type") and hasattr(estimator, "download_mojo"):
        if estimator._estimator_type == "regressor":
            self.verification = _Verification(active_values, target_values, precision, zeroThreshold)
        elif estimator._estimator_type == "classifier":
            probability_values = target_values[:, 1:]
            target_values = target_values[:, 0]
            self.verification = _Verification(active_values, target_values, precision, zeroThreshold)
            self.verification.probability_values = probability_values
Example #23
Source File: __init__.py From sklearn2pmml with GNU Affero General Public License v3.0

def _get_steps(obj):
    if isinstance(obj, Pipeline):
        return obj.steps
    elif isinstance(obj, BaseEstimator):
        return [("estimator", obj)]
    else:
        raise ValueError()
Example #24
Source File: test_preprocessing.py From skl-groups with BSD 3-Clause "New" or "Revised" License

def test_basic():
    bags = [np.random.normal(5, 3, size=(np.random.randint(10, 100), 20))
            for _ in xrange(50)]
    feats = Features(bags, stack=True)

    stder = BagStandardizer()
    stdized = stder.fit_transform(bags)
    stdized.make_stacked()

    assert np.allclose(np.mean(stdized.stacked_features), 0)
    assert np.allclose(np.std(stdized.stacked_features), 1)

    first_five = stder.transform(bags[:5])
    assert first_five == stdized[:5]

    minmaxer = BagMinMaxScaler([3, 7])
    minmaxed = minmaxer.fit_transform(feats)
    minmaxed.make_stacked()
    assert np.allclose(np.min(minmaxed.stacked_features, 0), 3)
    assert np.allclose(np.max(minmaxed.stacked_features, 0), 7)

    normer = BagNormalizer('l1')
    normed = normer.fit_transform(Features(bags))
    normed.make_stacked()
    assert np.allclose(np.sum(np.abs(normed.stacked_features), 1), 1)

    class GetMean(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X.mean(axis=1)[None, :]

    m = BagPreprocesser(GetMean())
    assert_raises(ValueError, lambda: m.transform(bags))
Example #25
Source File: learners.py From modAL with MIT License

def __init__(self,
             estimator: BaseEstimator,
             query_strategy: Callable = uncertainty_sampling,
             X_training: Optional[modALinput] = None,
             y_training: Optional[modALinput] = None,
             bootstrap_init: bool = False,
             **fit_kwargs
             ) -> None:
    super().__init__(estimator, query_strategy,
                     X_training, y_training, bootstrap_init, **fit_kwargs)
Example #26
Source File: disagreement.py From modAL with MIT License

def max_std_sampling(regressor: BaseEstimator, X: modALinput,
                     n_instances: int = 1, random_tie_break=False,
                     **predict_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Regressor standard deviation sampling strategy.

    Args:
        regressor: The regressor for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, shuffles utility scores to randomize the order.
            This can be used to break the tie when the highest utility score is
            not unique.
        **predict_kwargs: Keyword arguments to be passed to :meth:`predict` of
            the CommitteeRegressor.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    _, std = regressor.predict(X, return_std=True, **predict_kwargs)
    std = std.reshape(X.shape[0], )

    if not random_tie_break:
        query_idx = multi_argmax(std, n_instances=n_instances)
    else:
        query_idx = shuffled_argmax(std, n_instances=n_instances)

    return query_idx, X[query_idx]
Example #27
Source File: VectorQuantizer.py From stochastic_PMF with GNU General Public License v3.0

def __init__(self, clusterer=None, n_atoms=32, sparse=True, batch_size=1024,
             n_quantizers=1):
    '''Vector quantization by closest centroid:

        A[i] > 0 <=> i in argmin ||X - C_i||

    This implementation also supports soft encoding by mapping to the
    top k closest centroids.

    Arguments:
    ----------
    clusterer : {None, BaseEstimator}
        Instantiation of a clustering object
        (e.g. sklearn.cluster.MiniBatchKMeans)
        default: sklearn.cluster.MiniBatchKMeans

    n_atoms : int
        If no clusterer is provided, the number of dictionary elements
        (atoms) to extract

    sparse : bool
        Represent encoded data as a sparse matrix or ndarray

    batch_size : int
        Number of points to transform in parallel

    n_quantizers : int
        Number of quantizers to use for each point.
        By default, it uses 1 (hard VQ).
        Larger values use multiple codewords to represent each point.
    '''
    if clusterer is None:
        self.clusterer = sklearn.cluster.MiniBatchKMeans(n_clusters=n_atoms)
    else:
        self.clusterer = clusterer

    self.sparse = sparse
    self.batch_size = batch_size
    self.n_quantizers = n_quantizers
Example #28
Source File: fit.py From parfit with MIT License

def fitOne(model, X, y, params):
    """
    Makes one model fit using provided data and parameters
    :param model: The instantiated model you wish to pass, e.g. LogisticRegression()
    :param X: The independent variable data
    :param y: The response variable data
    :param params: The parameters passed through to the model from the parameter grid
    :return: Returns the fitted model
    """
    if isinstance(model, BaseEstimator):
        model.set_params(**params)
    else:
        model = model(**params)
    return model.fit(X, y)
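A usage sketch with hypothetical data, assuming scikit-learn's LogisticRegression; both branches of fitOne are exercised:

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.random((100, 3))
y = (X.sum(axis=1) > 1.5).astype(int)

# an instantiated estimator: params are applied via set_params()
fitted = fitOne(LogisticRegression(), X, y, {'C': 0.1})

# the class itself: params are passed to the constructor instead
fitted = fitOne(LogisticRegression, X, y, {'C': 0.1})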
Example #29
Source File: test_step.py From baikal with BSD 3-Clause "New" or "Revised" License

def test_get_params_without_init(self, teardown):
    """Test edge case where the base class does not define an __init__ method.
    get_params should resolve to object.__init__ which results in an empty dict.
    """

    class TransformerWithoutInit(TransformerMixin, BaseEstimator):
        pass

    class TransformerWithoutInitStep(Step, TransformerWithoutInit):
        pass

    step = TransformerWithoutInitStep()
    assert step.get_params() == {}
Example #30
Source File: test_sklearn_model_io.py From kryptoflow with GNU General Public License v3.0

def test_trainable_model(sklearn_model):
    assert isinstance(sklearn_model, BaseEstimator)
    trainable = TrainableModel(sklearn_model)
    assert isinstance(trainable.model, BaseEstimator)
    assert isinstance(trainable.serializer, SklearnModel)