Python Examples of xgboost.DMatrix

Source File: runner.py From ai-platform with MIT License

8 votes

def fit(self):
        """
        Train a booster on the prepared training split and persist it.

        Fetches the raw input, splits it with prepare_data(), runs the grid
        search on the training features/target, trains with the best
        parameters found and saves the resulting model under "model".

        :return: the test split produced by prepare_data()
        """
        train_frame, test_frame = self.prepare_data(self.get_input())
        features = train_frame.drop(['Value'], axis=1)
        target = train_frame['Value'].values

        train_matrix = xgb.DMatrix(features, target)
        best_params = self.grid_search(features, target).best_params_
        booster = xgb.train(dtrain=train_matrix, params=best_params)

        # NOTE(review): the booster is persisted through the mlflow.sklearn
        # flavor -- confirm this round-trips with whatever loads "model".
        mlflow.sklearn.save_model(booster, "model")
        return test_frame

Source File: models.py From steppy-toolkit with MIT License

7 votes

def fit(self, X, y, X_valid, y_valid):
        """
        Train an XGBoost booster and store it on ``self.estimator``.

        Logs the shapes of all four inputs, wraps the train/validation data
        in DMatrix objects (using ``self.dmatrix_parameters``) and trains
        with both sets on the evaluation watchlist.

        :return: self
        """
        shape_reports = (
            ('XGBoost, train data shape        {}', X),
            ('XGBoost, validation data shape   {}', X_valid),
            ('XGBoost, train labels shape      {}', y),
            ('XGBoost, validation labels shape {}', y_valid),
        )
        for template, array in shape_reports:
            logger.info(template.format(array.shape))

        dtrain = xgb.DMatrix(data=X, label=y, **self.dmatrix_parameters)
        dvalid = xgb.DMatrix(data=X_valid, label=y_valid, **self.dmatrix_parameters)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        self.estimator = xgb.train(
            params=self.booster_parameters,
            dtrain=dtrain,
            evals=watchlist,
            **self.training_parameters
        )
        return self

Source File: test_transform.py From pydatalab with Apache License 2.0

6 votes

def test_local_csv_transform(self):
    """Test transform from local csv files.

    Runs transform.py as a subprocess, then checks the dimensions of the
    produced libsvm file and the contents of the produced feature map.
    """

    cmd = ['python',
           os.path.join(CODE_PATH, 'transform.py'),
           '--csv=' + self.csv_input_filepath,
           '--analysis=' + self.analysis_dir,
           '--prefix=features',
           '--output=' + self.output_dir]
    print('cmd ', ' '.join(cmd))
    # Pass the argument list directly instead of a shell string so that
    # paths containing spaces or shell metacharacters are not re-split.
    subprocess.check_call(cmd)

    # Verify transformed file.
    libsvm_filepath = os.path.join(self.output_dir, 'features-00000-of-00001.libsvm')
    dtrain = xgb.DMatrix(libsvm_filepath)
    # assertEqual, not assertTrue: assertTrue(2056, x) treats x as the failure
    # message and always passes because 2056 is truthy.
    self.assertEqual(2056, dtrain.num_col())
    self.assertEqual(3, dtrain.num_row())

    # Verify featuremap file.
    featuremap_filepath = os.path.join(self.output_dir, 'featuremap-00000-of-00001.txt')
    df = pd.read_csv(featuremap_filepath, names=['index', 'description'])
    pd.util.testing.assert_series_equal(pd.Series(range(1, 2056), name='index'), df['index'])
    expected_descriptions = ['cat_col=Sunday', 'cat_col=Monday', 'img_col image feature 1000',
                             'num_col', 'text_col has "blue"']
    self.assertTrue(all(x in df['description'].values for x in expected_descriptions))

Source File: runner.py From ai-platform with MIT License

6 votes

def predict(self, df_test):
        """
        Predict on the test split (the next 7 days of electricity
        consumption), report the evaluation metrics and save the output.

        The metrics are printed, logged to mlflow, and the actual/predicted
        values are plotted before the predictions are handed to save_output().
        """
        model = mlflow.sklearn.load_model("model")

        features = df_test.drop(['Value'], axis=1)
        actual = df_test['Value'].values
        forecast = model.predict(xgb.DMatrix(features))
        prediction = pd.DataFrame({'Prediction': forecast})

        mape, rmse, mae, r2 = ForecastRunner.evaluation_metrics(actual, forecast)
        # Reporting order is MAPE, RMSE, R2, MAE for both stdout and mlflow.
        scores = (("MAPE", mape), ("RMSE", rmse), ("R2", r2), ("MAE", mae))
        for label, score in scores:
            print('{}: {}'.format(label, score))
        for label, score in scores:
            mlflow.log_metric(label, score)

        ForecastRunner.plot_result(actual, forecast)
        self.save_output(df_test, prediction)

Source File: predict.py From mars with Apache License 2.0

6 votes

def execute(cls, ctx, op):
        """
        Run the operand's model on its input chunk and store the prediction.

        The input is read from ``ctx``; a tuple is converted through
        ToDMatrix.get_xgb_dmatrix, anything else is wrapped in a DMatrix
        directly. The raw prediction is wrapped in a DataFrame or Series
        (indexed like the input) when the output chunk type asks for it,
        and written back to ``ctx`` under the first output's key.
        """
        from xgboost import DMatrix

        raw_data = ctx[op.data.key]
        if isinstance(raw_data, tuple):
            dmatrix = ToDMatrix.get_xgb_dmatrix(raw_data)
        else:
            dmatrix = DMatrix(raw_data)
        result = op.model.predict(dmatrix)

        out_chunk = op.outputs[0]
        # NOTE(review): the index is taken from the raw input, so this assumes
        # pandas-like input whenever a DataFrame/Series output is requested.
        if isinstance(out_chunk, DATAFRAME_CHUNK_TYPE):
            result = pd.DataFrame(result, index=raw_data.index)
        elif isinstance(out_chunk, SERIES_CHUNK_TYPE):
            result = pd.Series(result, index=raw_data.index, name='predictions')

        ctx[out_chunk.key] = result
Source File: test_boosted_trees_classifier.py From coremltools with BSD 3-Clause "New" or "Revised" License

6 votes

def setUpClass(self):
        """
        Load the Boston dataset, bin its target into classes and train a model.

        The continuous target is turned into integer class labels by
        digitizing it against the bin edges returned by np.histogram.
        """
        from sklearn.datasets import load_boston
        import numpy as np

        scikit_data = load_boston()
        raw_target = scikit_data.target
        bin_edges = np.histogram(raw_target)[1]
        target = np.digitize(raw_target, bin_edges) - 1

        dtrain = xgboost.DMatrix(
            scikit_data.data,
            label=target,
            feature_names=scikit_data.feature_names,
        )
        model = xgboost.train({}, dtrain)

        # Keep the model, the data and the derived labels for the tests.
        self.xgb_model = model
        self.target = target
        self.scikit_data = scikit_data
        self.n_classes = len(np.unique(target))

Source File: ch06-06-wrapper.py From kagglebook with BSD 3-Clause "New" or "Revised" License

6 votes

def evaluate(features):
    """Train on the given feature subset and return the validation log loss."""
    train_matrix = xgb.DMatrix(tr_x[features], label=tr_y)
    valid_matrix = xgb.DMatrix(va_x[features], label=va_y)

    params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71}
    num_round = 10  # in practice many more rounds are needed
    early_stopping_rounds = 3

    booster = xgb.train(
        params,
        train_matrix,
        num_round,
        evals=[(train_matrix, 'train'), (valid_matrix, 'eval')],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=0,
    )

    return log_loss(va_y, booster.predict(valid_matrix))


# ---------------------------------
# Greedy Forward Selection
# ----------------------------------

Source File: test_boosted_trees_regression.py From coremltools with BSD 3-Clause "New" or "Revised" License

6 votes

def setUpClass(self):
        """
        Load the Boston dataset and train a one-round regression model.

        Does nothing when xgboost or scikit-learn is unavailable.
        """
        if not (_HAS_XGBOOST and _HAS_SKLEARN):
            return

        boston = load_boston()
        train_matrix = xgboost.DMatrix(
            boston.data,
            label=boston.target,
            feature_names=boston.feature_names,
        )
        booster = xgboost.train({}, train_matrix, 1)

        # Keep the data, the model and the feature names for the tests.
        self.scikit_data = boston
        self.xgb_model = booster
        self.feature_names = boston.feature_names

Source File: model_xgb.py From kagglebook with BSD 3-Clause "New" or "Revised" License

6 votes

def train(self, tr_x, tr_y, va_x=None, va_y=None):
        """
        Train the booster and store it on ``self.model``.

        When validation data is given it is added to the watchlist and
        early stopping is enabled; otherwise only the training set is watched.
        'num_round' (and, with validation, 'early_stopping_rounds') are taken
        out of a copy of ``self.params`` before training.
        """
        # Set up the data
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y) if va_x is not None else None

        # Set up the hyperparameters (work on a copy so self.params is untouched)
        params = dict(self.params)
        num_round = params.pop('num_round')

        # Train
        if dvalid is None:
            self.model = xgb.train(params, dtrain, num_round,
                                   evals=[(dtrain, 'train')])
            return

        early_stopping_rounds = params.pop('early_stopping_rounds')
        self.model = xgb.train(params, dtrain, num_round,
                               evals=[(dtrain, 'train'), (dvalid, 'eval')],
                               early_stopping_rounds=early_stopping_rounds)

Source File: encoder.py From sagemaker-xgboost-container with Apache License 2.0

6 votes

def libsvm_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
    """Build a DMatrix from LIBSVM-formatted bytes.

    The payload is written to a temporary file, the DMatrix is loaded from
    that file, and the file is removed afterwards whether or not loading
    succeeded.

    Args:
        string_like (bytes): LIBSVM string.
    Returns:
        (xgb.DMatrix): XGBoost DataMatrix
    """
    temp_path = None
    try:
        # delete=False: the file must outlive the handle so the DMatrix
        # constructor can read it by name.
        with tempfile.NamedTemporaryFile(delete=False) as handle:
            temp_path = handle.name
            handle.write(string_like)

        return xgb.DMatrix(temp_path)
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)

Source File: level2.py From kaggle-kuzushiji-2019 with MIT License

6 votes

def train_xgb(train_features, train_y, valid_features, valid_y, *,
              eta, num_boost_round):
    """Train a binary-logistic booster, early-stopping on the validation set.

    Prints the parameter dict, evaluates on the validation data only,
    stops after 20 rounds without improvement and reports every 10 rounds.

    Returns the trained booster.
    """
    dtrain = xgb.DMatrix(train_features, label=train_y)
    dvalid = xgb.DMatrix(valid_features, label=valid_y)

    params = dict(
        eta=eta,
        objective='binary:logistic',
        gamma=0.01,
        max_depth=8,
    )
    print(params)

    watchlist = [(dvalid, 'eval')]
    booster = xgb.train(
        params, dtrain, num_boost_round, watchlist,
        early_stopping_rounds=20,
        verbose_eval=10,
    )
    return booster

Source File: wrap_xgb.py From gestalt with MIT License

6 votes

def fit(self, X, y, x_val=None, y_val=None):
        """
        Train the booster and store it on ``self.xgb``.

        :param X: training features
        :param y: training labels
        :param x_val: optional validation features
        :param y_val: optional validation labels

        With validation data, both sets are put on the watchlist and early
        stopping is enabled. Without it, the model simply trains for
        ``self.num_round`` rounds.
        """
        dtrain = xgb.DMatrix(X, label=y)
        train_kwargs = dict(params=self.params,
                            dtrain=dtrain,
                            num_boost_round=self.num_round,
                            verbose_eval=self.verbose)
        if x_val is not None:
            dtest = xgb.DMatrix(x_val, label=y_val)
            train_kwargs['evals'] = [(dtrain, 'train'), (dtest, 'validation')]
            # Early stopping needs an evaluation set to monitor; requesting it
            # without `evals` makes xgboost raise, so only pass it here.
            train_kwargs['early_stopping_rounds'] = self.early_stopping_rounds
        self.xgb = xgb.train(**train_kwargs)
        return

Source File: data_utils.py From sagemaker-xgboost-container with Apache License 2.0

6 votes

def _get_parquet_dmatrix_file_mode(files_path):
    """Get Data Matrix from parquet data in file mode.

    The first column of the loaded data is used as the label and the
    remaining columns as features.

    :param files_path: File path where parquet formatted training data resides, either directory or file
    :return: xgb.DMatrix
    :raises exc.UserError: if reading or converting the data fails
    """
    try:
        table = pq.read_table(files_path)

        data = table.to_pandas()
        # Drop the arrow table early to keep peak memory down.
        del table

        # isinstance (not an exact type check) so DataFrame subclasses are
        # converted too; they cannot be sliced positionally with [:, 1:].
        if isinstance(data, pd.DataFrame):
            # pyarrow.Table.to_pandas may produce NumPy array or pandas DataFrame
            data = data.to_numpy()

        dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])
        del data

        return dmatrix

    except Exception as e:
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(e))

Source File: data_utils.py From sagemaker-xgboost-container with Apache License 2.0

6 votes

def get_libsvm_dmatrix(files_path, is_pipe=False):
    """Load a DMatrix from LIBSVM data on disk.

    Pipe mode is rejected because it is not supported for libsvm.

    :param files_path: File path where LIBSVM formatted training data resides, either directory or file
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix
    :raises exc.UserError: in pipe mode, or when the data cannot be loaded
    """
    if is_pipe:
        raise exc.UserError("Pipe mode not supported for LibSVM.")

    try:
        return xgb.DMatrix(files_path)
    except Exception as load_error:
        raise exc.UserError("Failed to load libsvm data with exception:\n{}".format(load_error))

Source File: test_boosted_trees_regression_numeric.py From coremltools with BSD 3-Clause "New" or "Revised" License

6 votes

def setUpClass(self):
        """
        Load the Boston dataset and prepare the shared training matrix.

        Does nothing when xgboost or scikit-learn is unavailable.
        """
        if not (_HAS_XGBOOST and _HAS_SKLEARN):
            return

        # Load data and build the training matrix
        boston = load_boston()
        # Round-trip through single precision ("f") before storing as double ("d").
        self.X = boston.data.astype("f").astype("d")
        self.dtrain = xgboost.DMatrix(
            boston.data,
            label=boston.target,
            feature_names=boston.feature_names,
        )
        self.feature_names = boston.feature_names
        self.output_name = "target"

Source File: train_xgboost.py From jh-kaggle-util with Apache License 2.0

6 votes

def train_model(self, x_train, y_train, x_val, y_val):
        """
        Train an XGBoost model and record its best iteration in ``self.steps``.

        Without validation labels only the training set is watched. With
        them, the validation set is watched too and early stopping is used
        (``self.early_stop``, or ``self.rounds`` when that is 0).

        :return: the trained booster
        """
        print("Will train XGB for {} rounds, RandomSeed: {}".format(self.rounds, self.params['seed']))

        dtrain = xgb.DMatrix(x_train, label=y_train)

        if y_val is not None:
            patience = self.early_stop if self.early_stop != 0 else self.rounds
            dvalid = xgb.DMatrix(x_val, label=y_val)
            watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
            model = xgb.train(self.params, dtrain, self.rounds, watchlist, early_stopping_rounds=patience)
        else:
            watchlist = [(dtrain, 'train')]
            model = xgb.train(self.params, dtrain, self.rounds, watchlist)

        # NOTE(review): depending on the xgboost version, best_iteration may
        # only be defined when early stopping was used -- confirm for the
        # no-validation path.
        self.steps = model.best_iteration
        return model

Source File: wrap_xgb.py From gestalt with MIT License

6 votes

def fit(self, X, y, x_val=None, y_val=None):
        """
        Train the booster and store it on ``self.clf``.

        :param X: training features
        :param y: training labels
        :param x_val: optional validation features
        :param y_val: optional validation labels

        With validation data, both sets are put on the watchlist and early
        stopping is enabled. Without it, the model simply trains for
        ``self.num_round`` rounds.
        """
        dtrain = xgb.DMatrix(X, label=y)
        train_kwargs = dict(params=self.params,
                            dtrain=dtrain,
                            num_boost_round=self.num_round)
        if x_val is not None:
            dtest = xgb.DMatrix(x_val, label=y_val)
            train_kwargs['evals'] = [(dtrain, 'train'), (dtest, 'validation')]
            train_kwargs['verbose_eval'] = self.verbose
            # Early stopping needs an evaluation set to monitor; requesting it
            # without `evals` makes xgboost raise, so only pass it here.
            train_kwargs['early_stopping_rounds'] = self.early_stopping_rounds
        self.clf = xgb.train(**train_kwargs)
        return

Source File: dataset.py From ebonite with Apache License 2.0

5 votes

def deserialize(self, obj: list) -> xgboost.DMatrix:
        """
        Build an xgboost.DMatrix from the given object.

        :param obj: payload to convert
        :return: the resulting DMatrix
        :raises DeserializationError: if DMatrix construction fails with
            ValueError or TypeError
        """
        try:
            matrix = xgboost.DMatrix(obj)
        except (ValueError, TypeError):
            raise DeserializationError(f'given object: {obj} could not be converted to xgboost matrix')
        return matrix

Source File: xgbranker.py From xgboostExtension with Apache License 2.0

5 votes

def predict(self, X, output_margin=False, ntree_limit=0):
        """
        Return the booster's ranking scores for X.

        X is validated with check_array and its first column is dropped
        before prediction (presumably the group/query id column -- confirm
        against fit()).
        """
        X = check_array(X, accept_sparse=True)
        features = X[:, 1:]

        test_dmatrix = DMatrix(features, missing=self.missing)
        booster = self.get_booster()
        return booster.predict(test_dmatrix,
                               output_margin=output_margin,
                               ntree_limit=ntree_limit)

Source File: ch05-01-validation.py From kagglebook with BSD 3-Clause "New" or "Revised" License

5 votes

def predict(self, x):
        """Wrap x in a DMatrix and return the trained model's predictions."""
        return self.model.predict(xgb.DMatrix(x))


# -----------------------------------
# hold-out法
# -----------------------------------
# hold-out法でのバリデーションデータの分割

Source File: xgboost_models.py From gentun with Apache License 2.0

5 votes

def cross_validate(self):
        """Train model using k-fold cross validation and
        return mean value of validation metric.

        The value returned is the last entry of the
        'test-<eval_metric>-mean' series produced by xgb.cv.
        """
        d_train = xgb.DMatrix(self.x_train, label=self.y_train)
        # xgb calls its k-fold cross-validation parameter 'nfold'
        cv_result = xgb.cv(
            self.params, d_train, num_boost_round=self.num_boost_round,
            early_stopping_rounds=self.early_stopping_rounds, nfold=self.kfold
        )
        scores = cv_result['test-{}-mean'.format(self.eval_metric)]
        # Take the last value positionally. When xgb.cv returns a DataFrame,
        # `scores[-1]` is a *label* lookup on an integer index and raises
        # KeyError; going through a list works for any sequence-like result.
        return list(scores)[-1]

Source File: ch05-01-validation.py From kagglebook with BSD 3-Clause "New" or "Revised" License

5 votes

def fit(self, tr_x, tr_y, va_x, va_y):
        """
        Train a 10-round booster and store it on ``self.model``.

        ``self.params`` is layered on top of the default parameters, and
        both the training and validation sets are watched during training.
        """
        num_round = 10
        params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71}
        params.update(self.params)

        train_matrix = xgb.DMatrix(tr_x, label=tr_y)
        valid_matrix = xgb.DMatrix(va_x, label=va_y)
        self.model = xgb.train(
            params,
            train_matrix,
            num_round,
            evals=[(train_matrix, 'train'), (valid_matrix, 'eval')],
        )

Source File: dataset.py From ebonite with Apache License 2.0

5 votes

def process(self, obj: xgboost.DMatrix, **kwargs) -> DatasetType:
        """
        Describe the given DMatrix as a dataset type.

        Delegates to DMatrixDatasetType.from_dmatrix; extra keyword
        arguments are accepted but not used.
        """
        dataset_type = DMatrixDatasetType.from_dmatrix(obj)
        return dataset_type

Source File: test_dataset.py From ebonite with Apache License 2.0

5 votes

def test_deserialize__np(dtype_np, np_payload):
    """Deserializing the numpy payload must yield an xgboost.DMatrix."""
    assert isinstance(dtype_np.deserialize(np_payload), xgboost.DMatrix)

Source File: dataset.py From ebonite with Apache License 2.0

5 votes

def from_dmatrix(cls, dmatrix: xgboost.DMatrix):
        """
        Factory method building a :class:`DMatrixDatasetType` from an actual
        xgboost.DMatrix.

        The matrix is flagged as "from list" when its feature names are
        exactly the generated sequence f0, f1, ... for its column count.

        :param dmatrix: obj to create :class:`~.DatasetType` from
        :return: :class:`DMatrixDatasetType`
        """
        names = dmatrix.feature_names
        generated_names = [f'f{idx}' for idx in range(dmatrix.num_col())]
        is_from_list = (names == generated_names)
        return DMatrixDatasetType(is_from_list, dmatrix.feature_types, names)

Source File: dataset.py From ebonite with Apache License 2.0

5 votes

def serialize(self, instance: xgboost.DMatrix) -> list:
        """
        Always fails: the original data cannot be extracted from a DMatrix.

        :raises SerializationError: unconditionally
        """
        message = 'xgboost matrix does not support serialization'
        raise SerializationError(message)

Source File: model.py From ebonite with Apache License 2.0

5 votes

def _predict(self, data):
        """Predict with the wrapped model, converting data to a DMatrix if needed."""
        if isinstance(data, xgboost.DMatrix):
            matrix = data
        else:
            matrix = xgboost.DMatrix(data)
        return self.model.predict(matrix)

Source File: xgboost.py From mljar-supervised with MIT License

5 votes

def predict(self, X):
        """
        Return the model's predictions for X.

        X is wrapped in a DMatrix with NaN marked as the missing value, and
        prediction is limited to ``self.best_ntree_limit`` trees.

        :raises XgbAlgorithmException: if no model is loaded
        """
        if self.model is None:
            raise XgbAlgorithmException("Xgboost model is None")

        # np.nan rather than np.NaN: the capitalized alias was removed in
        # NumPy 2.0, while np.nan exists in every NumPy version.
        dmatrix = xgb.DMatrix(X, missing=np.nan)
        return self.model.predict(dmatrix, ntree_limit=self.best_ntree_limit)

Source File: wrap_xgb.py From gestalt with MIT License

5 votes

def predict_proba(self, X):
        """Wrap X in a DMatrix and return the classifier booster's predictions."""
        return self.clf.predict(xgb.DMatrix(X))


# Regressor Wrapper Class

Source File: wrap_xgb.py From gestalt with MIT License

5 votes

def predict(self, X):
        """Wrap X in a DMatrix and return the booster's predictions."""
        return self.xgb.predict(xgb.DMatrix(X))

Python xgboost.DMatrix() Examples