Python xgboost.DMatrix() Examples

The following are 30 code examples of xgboost.DMatrix(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module xgboost, or try the search function.
Example #1
Source File: runner.py    From ai-platform with MIT License 8 votes vote down vote up
def fit(self):
        """
        Train an XGBoost booster on the prepared training split and save it.

        The input is fetched with get_input() and split by prepare_data().
        grid_search() is run on the training features/labels and its best
        parameters are used to train the booster, which is then saved under
        the path "model".

        :return: the test split produced by prepare_data()
        """
        raw = self.get_input()
        df_train, df_test = self.prepare_data(raw)

        # 'Value' is the target column; every remaining column is a feature.
        features = df_train.drop(['Value'], axis=1)
        target = df_train['Value'].values

        dtrain = xgb.DMatrix(features, target)
        search = self.grid_search(features, target)
        best_params = search.best_params_
        booster = xgb.train(dtrain=dtrain, params=best_params)

        # save model to file
        mlflow.sklearn.save_model(booster, "model")
        return df_test
Example #2
Source File: models.py    From steppy-toolkit with MIT License 7 votes vote down vote up
def fit(self, X, y, X_valid, y_valid):
        """
        Train the booster on (X, y) while evaluating on (X_valid, y_valid).

        Logs the shapes of all four inputs, wraps both splits in DMatrix
        objects (extra DMatrix options come from self.dmatrix_parameters) and
        stores the trained booster in self.estimator.

        :return: self
        """
        shape_reports = (
            ('XGBoost, train data shape        {}', X),
            ('XGBoost, validation data shape   {}', X_valid),
            ('XGBoost, train labels shape      {}', y),
            ('XGBoost, validation labels shape {}', y_valid),
        )
        for template, array in shape_reports:
            logger.info(template.format(array.shape))

        dtrain = xgb.DMatrix(data=X, label=y, **self.dmatrix_parameters)
        dvalid = xgb.DMatrix(data=X_valid, label=y_valid, **self.dmatrix_parameters)

        # Both splits are monitored during training.
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        self.estimator = xgb.train(
            params=self.booster_parameters,
            dtrain=dtrain,
            evals=watchlist,
            **self.training_parameters
        )
        return self
Example #3
Source File: test_transform.py    From pydatalab with Apache License 2.0 6 votes vote down vote up
def test_local_csv_transform(self):
    """Test transform from local csv files.

    Runs transform.py on the CSV fixture, then checks the dimensions of the
    produced libsvm file and the contents of the produced featuremap file.
    """

    # Pass the arguments as a list (no shell) so that paths containing
    # spaces or shell metacharacters reach transform.py unchanged.
    cmd = ['python', os.path.join(CODE_PATH, 'transform.py'),
           '--csv=' + self.csv_input_filepath,
           '--analysis=' + self.analysis_dir,
           '--prefix=features',
           '--output=' + self.output_dir]
    print('cmd ', ' '.join(cmd))
    subprocess.check_call(cmd)

    # Verify transformed file.
    libsvm_filepath = os.path.join(self.output_dir, 'features-00000-of-00001.libsvm')
    dtrain = xgb.DMatrix(libsvm_filepath)
    # assertEqual, not assertTrue: assertTrue(2056, x) only checks that 2056
    # is truthy (x is taken as the failure message), so it can never fail.
    self.assertEqual(2056, dtrain.num_col())
    self.assertEqual(3, dtrain.num_row())

    # Verify featuremap file.
    featuremap_filepath = os.path.join(self.output_dir, 'featuremap-00000-of-00001.txt')
    df = pd.read_csv(featuremap_filepath, names=['index', 'description'])
    # NOTE(review): pd.util.testing was removed in pandas 2.0; switch to
    # pd.testing.assert_series_equal once the supported pandas range allows.
    pd.util.testing.assert_series_equal(pd.Series(range(1, 2056), name='index'), df['index'])
    expected_descriptions = ['cat_col=Sunday', 'cat_col=Monday', 'img_col image feature 1000',
                             'num_col', 'text_col has "blue"']
    self.assertTrue(all(x in df['description'].values for x in expected_descriptions))
Example #4
Source File: runner.py    From ai-platform with MIT License 6 votes vote down vote up
def predict(self, df_test):
        """
        Makes prediction for the next 7 days electricity consumption.

        Loads the model saved under "model", predicts on every column of
        df_test except 'Value', prints and logs MAPE/RMSE/R2/MAE against the
        'Value' column, plots the result and hands the predictions to
        save_output().

        :param df_test: frame holding the features plus the 'Value' target
        """
        # load model from file
        model = mlflow.sklearn.load_model("model")

        # make predictions for test data
        features = df_test.drop(['Value'], axis=1)
        actual = df_test['Value'].values
        predicted = model.predict(xgb.DMatrix(features))
        prediction = pd.DataFrame({'Prediction': predicted})

        mape, rmse, mae, r2 = ForecastRunner.evaluation_metrics(actual, predicted)
        report = (("MAPE", mape), ("RMSE", rmse), ("R2", r2), ("MAE", mae))

        # Print every metric first, then log them all, in the same order.
        for label, score in report:
            print('{}: {}'.format(label, score))
        for label, score in report:
            mlflow.log_metric(label, score)

        ForecastRunner.plot_result(actual, predicted)
        self.save_output(df_test, prediction)
Example #5
Source File: predict.py    From mars with Apache License 2.0 6 votes vote down vote up
def execute(cls, ctx, op):
        """
        Run op.model on the input chunk and store the predictions in ctx.

        The input is read from ctx[op.data.key]; a tuple is converted through
        ToDMatrix.get_xgb_dmatrix, anything else is wrapped in a DMatrix
        directly. The raw predictions are wrapped in a DataFrame or Series
        (indexed like the input) when the first output chunk is of the
        corresponding type, and written to ctx under that output's key.
        """
        from xgboost import DMatrix

        chunk = ctx[op.data.key]
        if isinstance(chunk, tuple):
            dmatrix = ToDMatrix.get_xgb_dmatrix(chunk)
        else:
            dmatrix = DMatrix(chunk)
        predictions = op.model.predict(dmatrix)

        # Re-attach the input's index so the output lines up with the input rows.
        if isinstance(op.outputs[0], DATAFRAME_CHUNK_TYPE):
            predictions = pd.DataFrame(predictions, index=chunk.index)
        elif isinstance(op.outputs[0], SERIES_CHUNK_TYPE):
            predictions = pd.Series(predictions, index=chunk.index, name='predictions')

        ctx[op.outputs[0].key] = predictions
Example #6
Source File: test_boosted_trees_classifier.py    From coremltools with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def setUpClass(self):
        """
        Set up the unit test by loading the dataset and training a model.

        The continuous Boston target is binned into class labels, a booster
        is trained on them with default parameters, and the data, labels,
        model and number of distinct labels are stored for the tests.
        """
        from sklearn.datasets import load_boston
        import numpy as np

        boston = load_boston()
        prices = boston.target

        # Bin the target using np.histogram's default bin edges;
        # np.digitize numbers bins from 1, hence the -1.
        bin_edges = np.histogram(prices)[1]
        labels = np.digitize(prices, bin_edges) - 1

        dtrain = xgboost.DMatrix(
            boston.data,
            label=labels,
            feature_names=boston.feature_names,
        )
        self.xgb_model = xgboost.train({}, dtrain)
        self.target = labels

        # Save the data and the model
        self.scikit_data = boston
        self.n_classes = len(np.unique(labels))
Example #7
Source File: ch06-06-wrapper.py    From kagglebook with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def evaluate(features):
    """Train a short XGBoost run on the given feature subset and score it.

    Uses the tr_x/tr_y and va_x/va_y data defined outside this function.

    :param features: column selection applied to both tr_x and va_x
    :return: log loss of the model's predictions on the validation data
    """
    dtrain = xgb.DMatrix(tr_x[features], label=tr_y)
    dvalid = xgb.DMatrix(va_x[features], label=va_y)

    booster_params = dict(objective='binary:logistic', silent=1, random_state=71)
    num_round = 10  # in practice many more rounds are needed

    model = xgb.train(
        booster_params,
        dtrain,
        num_round,
        evals=[(dtrain, 'train'), (dvalid, 'eval')],
        early_stopping_rounds=3,
        verbose_eval=0,
    )

    return log_loss(va_y, model.predict(dvalid))


# ---------------------------------
# Greedy Forward Selection
# ---------------------------------- 
Example #8
Source File: test_boosted_trees_regression.py    From coremltools with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def setUpClass(self):
        """
        Set up the unit test by loading the dataset and training a model.

        Does nothing when xgboost or scikit-learn is unavailable. Otherwise
        trains a booster for a single round on the Boston data and stores
        the data, the model and the feature names for the tests.
        """
        # Both optional dependencies are required for the setup.
        if not (_HAS_XGBOOST and _HAS_SKLEARN):
            return

        boston = load_boston()
        dtrain = xgboost.DMatrix(
            boston.data, label=boston.target, feature_names=boston.feature_names
        )
        booster = xgboost.train({}, dtrain, 1)

        # Save the data and the model
        self.scikit_data = boston
        self.xgb_model = booster
        self.feature_names = boston.feature_names
Example #9
Source File: model_xgb.py    From kagglebook with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def train(self, tr_x, tr_y, va_x=None, va_y=None):
        """
        Train an XGBoost booster and store it in self.model.

        self.params must contain 'num_round'; when validation data is given
        it must also contain 'early_stopping_rounds'. Both entries are
        removed from a copy of self.params before the rest is passed to
        xgb.train as booster parameters.

        :param tr_x: training features
        :param tr_y: training labels
        :param va_x: optional validation features; enables early stopping
        :param va_y: optional validation labels
        :raises KeyError: if a required entry is missing from self.params
        """
        # Set up the data
        validation = va_x is not None
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        if validation:
            dvalid = xgb.DMatrix(va_x, label=va_y)

        # Set the hyperparameters (work on a copy so self.params is untouched)
        params = dict(self.params)
        num_round = params.pop('num_round')

        # Train
        if validation:
            early_stopping_rounds = params.pop('early_stopping_rounds')
            watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
            self.model = xgb.train(params, dtrain, num_round, evals=watchlist,
                                   early_stopping_rounds=early_stopping_rounds)
        else:
            # 'early_stopping_rounds' is a training-loop setting, not a booster
            # parameter: drop it so it is not forwarded to xgb.train in params.
            params.pop('early_stopping_rounds', None)
            watchlist = [(dtrain, 'train')]
            self.model = xgb.train(params, dtrain, num_round, evals=watchlist)
Example #10
Source File: encoder.py    From sagemaker-xgboost-container with Apache License 2.0 6 votes vote down vote up
def libsvm_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Convert a LIBSVM string representation to a DMatrix object.

    The payload is written to a temporary file, the DMatrix is built from
    that file's path, and the file is removed again whether or not loading
    succeeded.

    Args:
        string_like (bytes): LIBSVM string.
    Returns:
        (xgb.DMatrix): XGBoost DataMatrix
    """
    tmp_path = None
    try:
        # delete=False keeps the file on disk after the handle is closed,
        # so it can be loaded by path below.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(string_like)

        return xgb.DMatrix(tmp_path)
    finally:
        # tmp_path is still None if creating the temporary file failed.
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)
Example #11
Source File: level2.py    From kaggle-kuzushiji-2019 with MIT License 6 votes vote down vote up
def train_xgb(train_features, train_y, valid_features, valid_y, *,
              eta, num_boost_round):
    """Train a binary-logistic booster with early stopping on the validation set.

    Prints the booster parameters before training. Only the validation set
    is monitored; training stops after 20 rounds without improvement and
    progress is reported every 10 rounds.

    :param eta: learning rate passed to the booster
    :param num_boost_round: upper bound on the number of boosting rounds
    :return: the booster returned by xgb.train
    """
    dtrain = xgb.DMatrix(train_features, label=train_y)
    dvalid = xgb.DMatrix(valid_features, label=valid_y)

    booster_params = dict(
        eta=eta,
        objective='binary:logistic',
        gamma=0.01,
        max_depth=8,
    )
    print(booster_params)

    booster = xgb.train(
        booster_params,
        dtrain,
        num_boost_round,
        evals=[(dvalid, 'eval')],
        early_stopping_rounds=20,
        verbose_eval=10,
    )
    return booster
Example #12
Source File: wrap_xgb.py    From gestalt with MIT License 6 votes vote down vote up
def fit(self, X, y, x_val=None, y_val=None):
        """
        Train a booster on (X, y) and store it in self.xgb.

        When x_val is given, the training and validation sets are both passed
        as evaluation sets; otherwise no evaluation set is supplied.

        :param X: training features
        :param y: training labels
        :param x_val: optional validation features
        :param y_val: optional validation labels
        :return: None
        """
        dtrain = xgb.DMatrix(X, label=y)

        # Arguments shared by both the validated and the unvalidated run.
        train_kwargs = dict(
            params=self.params,
            dtrain=dtrain,
            num_boost_round=self.num_round,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=self.verbose,
        )
        if x_val is not None:
            dvalid = xgb.DMatrix(x_val, label=y_val)
            train_kwargs['evals'] = [(dtrain, 'train'), (dvalid, 'validation')]

        self.xgb = xgb.train(**train_kwargs)
        return
Example #13
Source File: data_utils.py    From sagemaker-xgboost-container with Apache License 2.0 6 votes vote down vote up
def _get_parquet_dmatrix_file_mode(files_path):
    """Get Data Matrix from parquet data in file mode.

    The first column of the loaded table is used as the label and the
    remaining columns as features. Any failure along the way is re-raised
    as a UserError carrying the original message.

    :param files_path: File path where parquet formatted training data resides, either directory or file
    :return: xgb.DMatrix
    """
    try:
        data = pq.read_table(files_path).to_pandas()

        # pyarrow.Table.to_pandas may produce NumPy array or pandas DataFrame
        if type(data) is pd.DataFrame:
            data = data.to_numpy()

        labels = data[:, 0]
        features = data[:, 1:]
        return xgb.DMatrix(features, label=labels)

    except Exception as err:
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(err))
Example #14
Source File: data_utils.py    From sagemaker-xgboost-container with Apache License 2.0 6 votes vote down vote up
def get_libsvm_dmatrix(files_path, is_pipe=False):
    """Get DMatrix from libsvm file path.

    Pipe mode not currently supported for libsvm.

    :param files_path: File path where LIBSVM formatted training data resides, either directory or file
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix
    :raises exc.UserError: if pipe mode is requested or the data cannot be loaded
    """
    # Reject the unsupported mode before touching the data.
    if is_pipe:
        raise exc.UserError("Pipe mode not supported for LibSVM.")

    try:
        return xgb.DMatrix(files_path)
    except Exception as err:
        raise exc.UserError("Failed to load libsvm data with exception:\n{}".format(err))
Example #15
Source File: test_boosted_trees_regression_numeric.py    From coremltools with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def setUpClass(self):
        """
        Load the Boston data and prepare the training matrix for the tests.

        Does nothing when xgboost or scikit-learn is unavailable.
        """
        # Both optional dependencies are required for the setup.
        if not (_HAS_XGBOOST and _HAS_SKLEARN):
            return

        # Load data and train model
        boston = load_boston()
        names = boston.feature_names

        # Round-trip through float32 and back to float64.
        self.X = boston.data.astype("f").astype("d")
        self.dtrain = xgboost.DMatrix(
            boston.data, label=boston.target, feature_names=names
        )
        self.feature_names = names
        self.output_name = "target"
Example #16
Source File: train_xgboost.py    From jh-kaggle-util with Apache License 2.0 6 votes vote down vote up
def train_model(self, x_train, y_train, x_val, y_val):
        """
        Train a booster for self.rounds rounds and record its best iteration.

        Without validation labels only the training set is monitored. With
        them, the validation set is monitored as well and early stopping is
        enabled; an early_stop setting of 0 falls back to self.rounds.

        :return: the trained booster (its best_iteration is kept in self.steps)
        """
        print("Will train XGB for {} rounds, RandomSeed: {}".format(self.rounds, self.params['seed']))

        dtrain = xgb.DMatrix(x_train, label=y_train)
        watchlist = [(dtrain, 'train')]

        if y_val is not None:
            patience = self.early_stop if self.early_stop != 0 else self.rounds
            dval = xgb.DMatrix(x_val, label=y_val)
            watchlist.append((dval, 'eval'))
            booster = xgb.train(self.params, dtrain, self.rounds, watchlist,
                                early_stopping_rounds=patience)
        else:
            booster = xgb.train(self.params, dtrain, self.rounds, watchlist)

        self.steps = booster.best_iteration
        return booster
Example #17
Source File: wrap_xgb.py    From gestalt with MIT License 6 votes vote down vote up
def fit(self, X, y, x_val=None, y_val=None):
        """
        Train a booster on (X, y) and store it in self.clf.

        When x_val is given, the training and validation sets are passed as
        evaluation sets and self.verbose controls the evaluation output;
        otherwise neither is supplied.

        :param X: training features
        :param y: training labels
        :param x_val: optional validation features
        :param y_val: optional validation labels
        :return: None
        """
        dtrain = xgb.DMatrix(X, label=y)

        extra = {}
        if x_val is not None:
            dvalid = xgb.DMatrix(x_val, label=y_val)
            # Evaluation sets and verbosity only apply to the validated run.
            extra = dict(
                evals=[(dtrain, 'train'), (dvalid, 'validation')],
                verbose_eval=self.verbose,
            )

        self.clf = xgb.train(
            params=self.params,
            dtrain=dtrain,
            num_boost_round=self.num_round,
            early_stopping_rounds=self.early_stopping_rounds,
            **extra
        )
        return
Example #18
Source File: dataset.py    From ebonite with Apache License 2.0 5 votes vote down vote up
def deserialize(self, obj: list) -> xgboost.DMatrix:
        """
        Build a DMatrix from the given payload.

        :param obj: data to wrap in a DMatrix
        :return: the resulting xgboost.DMatrix
        :raises DeserializationError: if DMatrix construction raises ValueError or TypeError
        """
        try:
            matrix = xgboost.DMatrix(obj)
        except (ValueError, TypeError):
            raise DeserializationError(f'given object: {obj} could not be converted to xgboost matrix')
        return matrix
Example #19
Source File: xgbranker.py    From xgboostExtension with Apache License 2.0 5 votes vote down vote up
def predict(self, X, output_margin=False, ntree_limit=0):
        """
        Predict ranking values for X.

        X is validated with check_array (sparse input allowed) and its first
        column is excluded from the features handed to the booster.

        :param X: input array whose columns from index 1 onward are the features
        :param output_margin: forwarded to the booster's predict
        :param ntree_limit: forwarded to the booster's predict
        :return: the booster's predictions
        """
        checked = check_array(X, accept_sparse=True)

        # Column 0 is not a feature; only columns 1.. go into the DMatrix.
        features = checked[:, 1:]
        dtest = DMatrix(features, missing=self.missing)
        return self.get_booster().predict(dtest,
                                          output_margin=output_margin,
                                          ntree_limit=ntree_limit)
Example #20
Source File: ch05-01-validation.py    From kagglebook with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def predict(self, x):
        """Wrap x in a DMatrix and return self.model's predictions for it."""
        dmatrix = xgb.DMatrix(x)
        return self.model.predict(dmatrix)


# -----------------------------------
# hold-out法
# -----------------------------------
# hold-out法でのバリデーションデータの分割 
Example #21
Source File: xgboost_models.py    From gentun with Apache License 2.0 5 votes vote down vote up
def cross_validate(self):
        """Train model using k-fold cross validation and
        return mean value of validation metric.

        :return: the 'test-<eval_metric>-mean' value of the last boosting
            round reported by xgb.cv
        """
        d_train = xgb.DMatrix(self.x_train, label=self.y_train)
        # xgb calls its k-fold cross-validation parameter 'nfold'
        cv_result = xgb.cv(
            self.params, d_train, num_boost_round=self.num_boost_round,
            early_stopping_rounds=self.early_stopping_rounds, nfold=self.kfold,
            # Ask for plain sequences instead of a pandas DataFrame: on a
            # default-indexed pandas Series, [-1] is a label lookup and raises
            # KeyError rather than returning the last element.
            as_pandas=False,
        )
        return cv_result['test-{}-mean'.format(self.eval_metric)][-1]
Example #22
Source File: ch05-01-validation.py    From kagglebook with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def fit(self, tr_x, tr_y, va_x, va_y):
        """
        Train a booster for 10 rounds and store it in self.model.

        Default booster parameters are overridden by self.params; both the
        training and the validation set are monitored during training.
        """
        booster_params = dict(objective='binary:logistic', silent=1, random_state=71)
        booster_params.update(self.params)  # instance settings win over the defaults

        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)

        self.model = xgb.train(
            booster_params,
            dtrain,
            10,
            evals=[(dtrain, 'train'), (dvalid, 'eval')],
        )
Example #23
Source File: dataset.py    From ebonite with Apache License 2.0 5 votes vote down vote up
def process(self, obj: xgboost.DMatrix, **kwargs) -> DatasetType:
        """
        Describe the given DMatrix by delegating to DMatrixDatasetType.from_dmatrix.

        :param obj: DMatrix to describe
        :param kwargs: accepted but not used by this implementation
        :return: the value returned by DMatrixDatasetType.from_dmatrix(obj)
        """
        return DMatrixDatasetType.from_dmatrix(obj) 
Example #24
Source File: test_dataset.py    From ebonite with Apache License 2.0 5 votes vote down vote up
def test_deserialize__np(dtype_np, np_payload):
    """Check that deserializing np_payload through dtype_np yields an xgboost.DMatrix."""
    # NOTE(review): dtype_np and np_payload are presumably pytest fixtures defined elsewhere - confirm.
    dmatrix = dtype_np.deserialize(np_payload)
    assert isinstance(dmatrix, xgboost.DMatrix) 
Example #25
Source File: dataset.py    From ebonite with Apache License 2.0 5 votes vote down vote up
def from_dmatrix(cls, dmatrix: xgboost.DMatrix):
        """
        Factory method to extract :class:`~.DatasetType` from actual xgboost.DMatrix

        The matrix is flagged as "from list" when its feature names are exactly
        f0, f1, ... for every column (presumably the names xgboost assigns
        when none were supplied - confirm).

        :param dmatrix: obj to create :class:`~.DatasetType` from
        :return: :class:`DMatrixDatasetType`
        """
        positional_names = [f'f{i}' for i in range(dmatrix.num_col())]
        is_from_list = dmatrix.feature_names == positional_names
        return DMatrixDatasetType(
            is_from_list,
            dmatrix.feature_types,
            dmatrix.feature_names,
        )
Example #26
Source File: dataset.py    From ebonite with Apache License 2.0 5 votes vote down vote up
def serialize(self, instance: xgboost.DMatrix) -> list:
        """
        Raises an error because there is no way to extract original data from DMatrix

        :param instance: DMatrix that was requested to be serialized (never read)
        :raises SerializationError: always
        """
        raise SerializationError('xgboost matrix does not support serialization') 
Example #27
Source File: model.py    From ebonite with Apache License 2.0 5 votes vote down vote up
def _predict(self, data):
        """
        Return self.model's predictions for data.

        :param data: an xgboost.DMatrix, or anything a DMatrix can be built from
        """
        # Only wrap inputs that are not already a DMatrix.
        dmatrix = data if isinstance(data, xgboost.DMatrix) else xgboost.DMatrix(data)
        return self.model.predict(dmatrix)
Example #28
Source File: xgboost.py    From mljar-supervised with MIT License 5 votes vote down vote up
def predict(self, X):
        """
        Predict with the trained booster, limited to self.best_ntree_limit trees.

        :param X: features to predict on; NaN entries are treated as missing
        :return: the booster's predictions
        :raises XgbAlgorithmException: if no model has been set
        """
        if self.model is None:
            raise XgbAlgorithmException("Xgboost model is None")

        # np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0.
        dmatrix = xgb.DMatrix(X, missing=np.nan)
        return self.model.predict(dmatrix, ntree_limit=self.best_ntree_limit)
Example #29
Source File: wrap_xgb.py    From gestalt with MIT License 5 votes vote down vote up
def predict_proba(self, X):
        """Wrap X in a DMatrix and return self.clf's predictions for it."""
        dmatrix = xgb.DMatrix(X)
        return self.clf.predict(dmatrix)


# Regressor Wrapper Class 
Example #30
Source File: wrap_xgb.py    From gestalt with MIT License 5 votes vote down vote up
def predict(self, X):
        """Wrap X in a DMatrix and return self.xgb's predictions for it."""
        dmatrix = xgb.DMatrix(X)
        return self.xgb.predict(dmatrix)