Python sklearn.model_selection.train_test_split() Examples

The following are 30 code examples of sklearn.model_selection.train_test_split(), collected from open source projects. The source file and project are noted above each example.
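Before the project examples, here is a minimal, self-contained sketch of a typical call. The iris dataset and the 80/20 split are illustrative choices, not taken from any of the examples below:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)

# Hold out 20% of the 150 rows for testing. random_state fixes the shuffle for
# reproducibility; stratify=y keeps the class proportions equal in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

print(X_train.shape, X_test.shape)  # (120, 4) (30, 4)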
Example #1
Source File: multi_class_classification.py    From edge2vec with BSD 3-Clause "New" or "Revised" License
def multi_class_classification(data_X,data_Y):
    '''
    calculate multi-class classification and return related evaluation metrics
    '''

    svc = svm.SVC(C=1, kernel='linear')
    # X_train, X_test, y_train, y_test = train_test_split( data_X, data_Y, test_size=0.4, random_state=0) 
    clf = svc.fit(data_X, data_Y) #svm
    # array = svc.coef_
    # print array
    predicted = cross_val_predict(clf, data_X, data_Y, cv=2)
    print "accuracy",metrics.accuracy_score(data_Y, predicted)
    print "f1 score macro",metrics.f1_score(data_Y, predicted, average='macro') 
    print "f1 score micro",metrics.f1_score(data_Y, predicted, average='micro') 
    print "precision score",metrics.precision_score(data_Y, predicted, average='macro') 
    print "recall score",metrics.recall_score(data_Y, predicted, average='macro') 
    print "hamming_loss",metrics.hamming_loss(data_Y, predicted)
    print "classification_report", metrics.classification_report(data_Y, predicted)
    print "jaccard_similarity_score", metrics.jaccard_similarity_score(data_Y, predicted)
    # print "log_loss", metrics.log_loss(data_Y, predicted)
    print "zero_one_loss", metrics.zero_one_loss(data_Y, predicted)
    # print "AUC&ROC",metrics.roc_auc_score(data_Y, predicted)
    # print "matthews_corrcoef", metrics.matthews_corrcoef(data_Y, predicted) 
Example #2
Source File: mmbot.py    From MaliciousMacroBot with MIT License
def mmb_evaluate_model(self):
        """
        Returns scores from cross validation evaluation on the malicious / benign classifier
        """
        predictive_features = self.features['predictive_features']
        self.clf_X = self.modeldata[predictive_features].values
        self.clf_y = np.array(self.modeldata['label'])

        X_train, X_test, y_train, y_test = train_test_split(self.clf_X, self.clf_y, test_size=0.2, random_state=0)
        lb = LabelBinarizer()
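        # LabelBinarizer returns an (n, 1) column for binary labels; take element 0 to flatten to 1-D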
        y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
        eval_cls = RandomForestClassifier(n_estimators=100, max_features=.2)
        eval_cls.fit(X_train, y_train)

        recall = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='recall')
        precision = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='precision')
        accuracy = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='accuracy')
        f1_score = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='f1_macro')

        return {'accuracy': accuracy, 'f1': f1_score, 'precision': precision, 'recall': recall} 
Example #3
Source File: test_shap.py    From AIX360 with Apache License 2.0
def test_ShapLinearExplainer(self):
        corpus, y = shap.datasets.imdb()
        corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=7)

        vectorizer = TfidfVectorizer(min_df=10)
        X_train = vectorizer.fit_transform(corpus_train)
        X_test = vectorizer.transform(corpus_test)

        model = sklearn.linear_model.LogisticRegression(penalty="l1", C=0.1, solver='liblinear')
        model.fit(X_train, y_train)

        shapexplainer = LinearExplainer(model, X_train, feature_dependence="independent")
        shap_values = shapexplainer.explain_instance(X_test)
        print("Invoked Shap LinearExplainer")

    # the following test is commented out because Travis runs out of resources
Example #4
Source File: ml_elm.py    From Python-ELM with MIT License
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import train_test_split

    db_name = 'diabetes'
    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)

    tmp = data_set.target
    tmpL = [ 1 if i == "tested_positive" else -1 for i in tmp]
    data_set.target = tmpL

    X_train, X_test, y_train, y_test = train_test_split(
        data_set.data, data_set.target, test_size=0.4)

    mlelm = MLELM(hidden_units=(10, 30, 200)).fit(X_train, y_train)
    elm = ELM(200).fit(X_train, y_train)

    print("MLELM Accuracy %0.3f " % mlelm.score(X_test, y_test))
    print("ELM Accuracy %0.3f " % elm.score(X_test, y_test)) 
Example #5
Source File: automl.py    From Kaggler with MIT License
def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
        X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=test_size, shuffle=self.shuffle)

        def objective(hyperparams):
            model = XGBModel(n_estimators=self.n_est, **self.params, **hyperparams)
            model.fit(X=X_trn, y=y_trn,
                      eval_set=[(X_val, y_val)],
                      eval_metric=self.metric,
                      early_stopping_rounds=self.n_stop,
                      verbose=False)
            score = model.evals_result()['validation_0'][self.metric][model.best_iteration] * self.loss_sign

            return {'loss': score, 'status': STATUS_OK, 'model': model}

        trials = Trials()
        best = hyperopt.fmin(fn=objective, space=self.space, trials=trials,
                             algo=tpe.suggest, max_evals=n_eval, verbose=1,
                             rstate=self.random_state)

        hyperparams = space_eval(self.space, best)
        return hyperparams, trials 
Example #6
Source File: automl.py    From Kaggler with MIT License
def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
        X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=test_size, shuffle=self.shuffle)

        train_data = lgb.Dataset(X_trn, label=y_trn)
        valid_data = lgb.Dataset(X_val, label=y_val)

        def objective(hyperparams):
            model = lgb.train({**self.params, **hyperparams}, train_data, self.n_est,
                              valid_data, early_stopping_rounds=self.n_stop, verbose_eval=0)

            score = model.best_score["valid_0"][self.metric] * self.loss_sign

            return {'loss': score, 'status': STATUS_OK, 'model': model}

        trials = Trials()
        best = hyperopt.fmin(fn=objective, space=self.space, trials=trials,
                             algo=tpe.suggest, max_evals=n_eval, verbose=1,
                             rstate=self.random_state)

        hyperparams = space_eval(self.space, best)
        return hyperparams, trials 
Example #7
Source File: label_digits.py    From libact with BSD 2-Clause "Simplified" License
def split_train_test(n_classes):
    from sklearn.datasets import load_digits

    n_labeled = 5
    digits = load_digits(n_class=n_classes)  # consider binary case
    X = digits.data
    y = digits.target
    print(np.shape(X))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    while len(np.unique(y_train[:n_labeled])) < n_classes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33)

    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)

    return trn_ds, tst_ds, digits 
Example #8
Source File: pipeline.py    From qb with MIT License
def run(self):
        with open(QANTA_TRAIN_DATASET_PATH) as f:
            all_guess_train = [q for q in json.load(f)['questions'] if q['fold'] == GUESSER_TRAIN_FOLD]

        guess_train, guess_val = train_test_split(all_guess_train, random_state=42, train_size=.9)

        with open(QANTA_DEV_DATASET_PATH) as f:
            guess_dev = [q for q in json.load(f)['questions'] if q['fold'] == GUESSER_DEV_FOLD]

        with open(QANTA_TORCH_TRAIN_LOCAL_PATH, 'w') as f:
            json.dump(format_qanta_json(guess_train, DS_VERSION), f)

        with open(QANTA_TORCH_VAL_LOCAL_PATH, 'w') as f:
            json.dump(format_qanta_json(guess_val, DS_VERSION), f)

        with open(QANTA_TORCH_DEV_LOCAL_PATH, 'w') as f:
            json.dump(format_qanta_json(guess_dev, DS_VERSION), f) 
Example #9
Source File: test_GaussianNB.py    From differential-privacy-library with MIT License
def test_with_iris(self):
        global_seed(12345)
        from sklearn import datasets
        dataset = datasets.load_iris()

        x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=.2)

        bounds = ([4.3, 2.0, 1.0, 0.1], [7.9, 4.4, 6.9, 2.5])

        clf = GaussianNB(epsilon=5.0, bounds=bounds)
        clf.fit(x_train, y_train)

        accuracy = clf.score(x_test, y_test)
        counts = clf.class_count_.copy()
        self.assertGreater(accuracy, 0.5)

        clf.partial_fit(x_train, y_train)
        new_counts = clf.class_count_
        self.assertEqual(np.sum(new_counts), np.sum(counts) * 2) 
Example #10
Source File: utils.py    From cloudml-samples with Apache License 2.0
def data_train_test_split(data_df):
  """Split the DataFrame two subsets for training and testing.

  Args:
    data_df: (pandas.DataFrame) The DataFrame to split.

  Returns:
    A Tuple of (pandas.DataFrame, pandas.Series,
                pandas.DataFrame, pandas.Series)
  """

  label_column = metadata.LABEL
  # Only use metadata.FEATURE_COLUMNS + metadata.LABEL
  columns_to_use = metadata.FEATURE_COLUMNS + [label_column]

  train, val = model_selection.train_test_split(data_df[columns_to_use])
  x_train, y_train = _feature_label_split(train, label_column)
  x_val, y_val = _feature_label_split(val, label_column)
  return x_train, y_train, x_val, y_val 
Example #11
Source File: common_utils.py    From interpret-text with MIT License
def create_simple_titanic_data():
    titanic_url = (
        "https://raw.githubusercontent.com/amueller/"
        "scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv"
    )
    data = read_csv(titanic_url)
    # fill missing values
    data = data.fillna(method="ffill")
    data = data.fillna(method="bfill")
    numeric_features = ["age", "fare"]
    categorical_features = ["embarked", "sex", "pclass"]

    y = data["survived"].values
    X = data[categorical_features + numeric_features]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test, numeric_features, categorical_features 
Example #12
Source File: test_LinearRegression.py    From differential-privacy-library with MIT License
def test_same_results(self):
        from sklearn import datasets
        from sklearn.model_selection import train_test_split
        from sklearn import linear_model

        dataset = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2)

        clf = LinearRegression(data_norm=12, epsilon=float("inf"),
                               bounds_X=([4.3, 2.0, 1.0, 0.1], [7.9, 4.4, 6.9, 2.5]), bounds_y=(0, 2))
        clf.fit(X_train, y_train)

        predict1 = clf.predict(X_test)

        clf = linear_model.LinearRegression(normalize=False)
        clf.fit(X_train, y_train)

        predict2 = clf.predict(X_test)

        self.assertTrue(np.allclose(predict1, predict2)) 
Example #13
Source File: graph_eval.py    From nodevectors with MIT License
def print_labeled_tests(w, y, test_size=0.2, seed=42):
    """
    Clustering and label prediction tests
    """
    X_train, X_test, y_train, y_test = train_test_split(
        w, y, test_size=test_size, random_state=seed)
    # Print Label Prediction Tests
    res = LabelPrediction(w, y, test_size=test_size, seed=seed)
    # Can only cluster on single-label (not multioutput)
    if len(y.shape) < 2:
        n_clusters = np.unique(y).size
        umpagglo = cluster.AgglomerativeClustering(
            n_clusters=n_clusters, 
            affinity='cosine', 
            linkage='average'
        ).fit(w).labels_
        x = evalClusteringOnLabels(umpagglo, y, verbose=True)
        res = {**res, **x}
    return res 
Example #14
Source File: test_LinearRegression.py    From differential-privacy-library with MIT License
def test_different_results(self):
        from sklearn import datasets
        from sklearn import linear_model
        from sklearn.model_selection import train_test_split

        dataset = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2)

        clf = LinearRegression(data_norm=12, bounds_X=([4.3, 2.0, 1.1, 0.1], [7.9, 4.4, 6.9, 2.5]), bounds_y=(0, 2))
        clf.fit(X_train, y_train)

        predict1 = clf.predict(X_test)

        clf = LinearRegression(data_norm=12, bounds_X=([4.3, 2.0, 1.1, 0.1], [7.9, 4.4, 6.9, 2.5]), bounds_y=(0, 2))
        clf.fit(X_train, y_train)

        predict2 = clf.predict(X_test)

        clf = linear_model.LinearRegression()
        clf.fit(X_train, y_train)

        predict3 = clf.predict(X_test)

        self.assertFalse(np.all(predict1 == predict2))
        self.assertFalse(np.all(predict3 == predict1) and np.all(predict3 == predict2)) 
Example #15
Source File: common_utils.py    From interpret-text with MIT License
def create_cancer_data():
    # Import cancer dataset
    cancer = (
        retrieve_dataset("breast-cancer.train.csv", na_values="?")
        .interpolate()
        .astype("int64")
    )
    cancer_target = cancer.iloc[:, 0]
    cancer_data = cancer.iloc[:, 1:]
    feature_names = cancer_data.columns.values
    target_names = ["no_cancer", "cancer"]
    # Split data into train and test
    x_train, x_test, y_train, y_validation = train_test_split(
        cancer_data, cancer_target, test_size=0.2, random_state=0
    )
    return x_train, x_test, y_train, y_validation, feature_names, target_names 
Example #16
Source File: test_LogisticRegression.py    From differential-privacy-library with MIT License
def test_same_results(self):
        from sklearn import datasets
        from sklearn.model_selection import train_test_split
        from sklearn import linear_model

        dataset = datasets.load_iris()
        X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2)

        clf = LogisticRegression(data_norm=12, epsilon=float("inf"))
        clf.fit(X_train, y_train)

        predict1 = clf.predict(X_test)

        clf = linear_model.LogisticRegression(solver="lbfgs", multi_class="ovr")
        clf.fit(X_train, y_train)

        predict2 = clf.predict(X_test)

        self.assertTrue(np.all(predict1 == predict2)) 
Example #17
Source File: main_data_engine.py    From TripletLossFace with MIT License
def create_tensorflow_dataset_object(self, paths, labels, test_rate: float = 0.1, test_data: tuple = (None, None), supportive: bool = False):
		print("Creating TensorFlow dataset object...")
		if type(test_data) != tuple:
			print("\"test_data\" must be tuple for 'create_tensorflow_dataset_object', test data will be taken from real data " + 
				f"with rate of {test_rate}. You have 5 seconds to stop the process and cancel running. Use Ctrl+C to do that.")

			time.sleep(5)


		paths_train, paths_test, labels_train, labels_test = train_test_split(paths, labels, test_size=test_rate, random_state=42)
		print("Dataset splitted by system, please make sure this is what you want.")

		dataset_train = tf.data.Dataset.from_tensor_slices((paths_train, labels_train)).shuffle(len(labels_train))
		dataset_test = tf.data.Dataset.from_tensor_slices((paths_test, labels_test)).shuffle(len(labels_test))
		print("TensorFlow dataset object created!")

		if not supportive:
			self.dataset_train = dataset_train
			self.dataset_test = dataset_test

		return dataset_train, dataset_test 
Example #18
Source File: main_data_engine.py    From TripletLossFace with MIT License
def create_tensorflow_dataset_object(self, paths, labels, test_rate: float = 0.1, test_data: tuple = (None, None), supportive: bool = False):
		print("Creating TensorFlow dataset object...")
		if type(test_data) != tuple:
			printl("\"test_data\" must be tuple for 'create_tensorflow_dataset_object', test data will be taken from real data " + 
				f"with rate of {test_rate}. You have 5 seconds to stop the process and cancel running. Use Ctrl+C to do that.")

			time.sleep(5)


		paths_train, paths_test, labels_train, labels_test = train_test_split(paths, labels, test_size=test_rate, random_state=42)
		print("Dataset splitted by system, please make sure this is what you want.")

		dataset_train = tf.data.Dataset.from_tensor_slices((paths_train, labels_train)).shuffle(len(labels_train))
		dataset_test = tf.data.Dataset.from_tensor_slices((paths_test, labels_test)).shuffle(len(labels_test))
		print("TensorFlow dataset object created!")

		if not supportive:
			self.dataset_train = dataset_train
			self.dataset_test = dataset_test

		return dataset_train, dataset_test 
Example #19
Source File: audio_util.py    From Tensorflow-Audio-Classification with Apache License 2.0
def train_test_val_split(X, Y, split=(0.2, 0.1), shuffle=True):
    """Split a dataset into train/test/val subsets, 70:20:10 by default.

    Args:
      X: List of data.
      Y: List of labels corresponding to the data.
      split: Tuple of split ratios in `(test, val)` order.
      shuffle: Bool, whether to shuffle the data before splitting.

    Returns:
      Three datasets in `train:test:val` order.
    """
    from sklearn.model_selection import train_test_split
    assert len(X) == len(Y), 'The length of X and Y must be consistent.'
    # First carve off the test+val chunk, then split that chunk so the overall
    # proportions match `split`. (The original second split used
    # test_size=split[1], which yields 70:27:3 rather than the documented
    # 70:20:10.)
    X_train, X_test_val, Y_train, Y_test_val = train_test_split(X, Y,
        test_size=(split[0] + split[1]), shuffle=shuffle)
    X_test, X_val, Y_test, Y_val = train_test_split(X_test_val, Y_test_val,
        test_size=split[1] / (split[0] + split[1]), shuffle=False)
    return (X_train, Y_train), (X_test, Y_test), (X_val, Y_val)
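As a quick sanity check of the proportions (a hypothetical call; any two equal-length sequences work):

(x_tr, _), (x_te, _), (x_va, _) = train_test_val_split(list(range(100)), list(range(100)))
print(len(x_tr), len(x_te), len(x_va))  # roughly 70 20 10; counts can shift by one due to float rounding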
Example #20
Source File: run.py    From fake-news-detection with MIT License
def pipeline(args):
    '''
    Runs the model loop.
    '''
    df = pd.read_csv(args.filename)
    df.loc[:,args.x_label] = df[args.x_label].fillna("None")
    if args.dedupe:
        df = df.drop_duplicates(subset='content')
    if args.reduce:
        df = restrict_sources(df)
    X = df[args.x_label]
    y = df[args.y_label]
    parser = spacy.load('en')
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    loop = ModelLoop(X_train, X_test, y_train, y_test, args.models,
                     args.iterations, args.output_dir,
                     thresholds = args.thresholds, ks = args.ks,
                     setting=args.features[0])
    loop.run() 
Example #21
Source File: learning_curve.py    From dota2-predictor with MIT License
def plot_learning_curve(x_train, y_train, subsets=20, mmr=None, cv=5, tool='matplotlib'):
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)

    subset_sizes = np.exp(np.linspace(3, np.log(len(y_train)), subsets)).astype(int)

    results_list = [[], []]

    for subset_size in subset_sizes:
        logger.info('Performing cross validation on subset_size %d', subset_size)
        _, _, cv_score, roc_auc, _ = evaluate([x_train[:subset_size], y_train[:subset_size]],
                                              [x_test, y_test], cv=cv)

        results_list[0].append(1 - cv_score)
        results_list[1].append(1 - roc_auc)

    if tool == 'matplotlib':
        _plot_matplotlib(subset_sizes, results_list, mmr)
    else:
        _plot_plotly(subset_sizes, results_list, mmr) 
Example #22
Source File: models.py    From xcessiv with Apache License 2.0
def return_train_dataset(self):
        """Returns train data set

        Returns:
            X (numpy.ndarray): Features

            y (numpy.ndarray): Labels
        """
        X, y = self.return_main_dataset()

        if self.test_dataset['method'] == 'split_from_main':
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=self.test_dataset['split_ratio'],
                random_state=self.test_dataset['split_seed'],
                stratify=y
            )

        return X, y 
Example #23
Source File: make_dataset.py    From snape with Apache License 2.0
def write_dataset(df, file_name, out_path="." + os.path.sep):
    """
    Writes generated dataset to file

    :param df: dataframe to write
    :param file_name: beginning of filename
    :param out_path: the path to write the dataset
    :return: None
    """
    # todo: Mike, do we want to take a param for overwriting existing files?
    df_train, df_testkey = train_test_split(df, test_size=.2)

    df_train.to_csv(out_path + file_name + "_train.csv", index=False)
    df_test = df_testkey.drop(['y'], axis=1)
    df_test.to_csv(out_path + file_name + "_test.csv", index=False)
    df_testkey.to_csv(out_path + file_name + "_testkey.csv", index=False) 
Example #24
Source File: sgcn.py    From SGCN with GNU General Public License v3.0
def setup_dataset(self):
        """
        Creating train and test split.
        """
        self.positive_edges, self.test_positive_edges = train_test_split(self.edges["positive_edges"],
                                                                         test_size=self.args.test_size)

        self.negative_edges, self.test_negative_edges = train_test_split(self.edges["negative_edges"],
                                                                         test_size=self.args.test_size)
        self.ecount = len(self.positive_edges + self.negative_edges)

        self.X = setup_features(self.args,
                                self.positive_edges,
                                self.negative_edges,
                                self.edges["ncount"])

        self.positive_edges = torch.from_numpy(np.array(self.positive_edges,
                                                        dtype=np.int64).T).type(torch.long).to(self.device)

        self.negative_edges = torch.from_numpy(np.array(self.negative_edges,
                                                        dtype=np.int64).T).type(torch.long).to(self.device)

        self.y = np.array([0 if i < int(self.ecount/2) else 1 for i in range(self.ecount)]+[2]*(self.ecount*2))
        self.y = torch.from_numpy(self.y).type(torch.LongTensor).to(self.device)
        self.X = torch.from_numpy(self.X).float().to(self.device) 
Example #25
Source File: DataModule.py    From sgd-influence with MIT License
def fetch(self, n_tr, n_val, n_test, seed=0):
        x, y = self.load()
        
        # split data
        x_tr, x_val, y_tr, y_val = train_test_split(
            x, y, train_size=n_tr, test_size=n_val+n_test, random_state=seed)
        x_val, x_test, y_val, y_test = train_test_split(
            x_val, y_val, train_size=n_val, test_size=n_test, random_state=seed+1)
        
        # process x
        if self.normalize:
            scaler = StandardScaler()
            scaler.fit(x_tr)
            x_tr = scaler.transform(x_tr)
            x_val = scaler.transform(x_val)
            x_test = scaler.transform(x_test)
        if self.append_one:
            x_tr = np.c_[x_tr, np.ones(n_tr)]
            x_val = np.c_[x_val, np.ones(n_val)]
            x_test = np.c_[x_test, np.ones(n_test)]
        
        return (x_tr, y_tr), (x_val, y_val), (x_test, y_test) 
Example #26
Source File: datasets.py    From kaggle-carvana-2017 with MIT License
def bootstrapped_split(car_ids, seed=args.seed):
    """
    # Arguments
        metadata: metadata.csv provided by Carvana (should include
        `train` column).

    # Returns
        A tuple (train_ids, test_ids)
    """
    all_ids = pd.Series(car_ids)
    train_ids, valid_ids = train_test_split(car_ids, test_size=args.test_size_float,
                                                     random_state=seed)

    np.random.seed(seed)
    bootstrapped_idx = np.random.random_integers(0, len(train_ids))
    bootstrapped_train_ids = train_ids[bootstrapped_idx]

    return generate_filenames(bootstrapped_train_ids.values), generate_filenames(valid_ids) 
Example #27
Source File: test_classify_question.py    From adam_qas with GNU General Public License v3.0
def test_classify_question(self):
        training_data_path = os.path.join(CORPUS_DIR, QUESTION_CLASSIFICATION_TRAINING_DATA)
        df_question = pandas.read_csv(training_data_path, sep='|', header=0)
        df_question_train, df_question_test = train_test_split(df_question, test_size=0.2, random_state=42)

        predicted_class, clf, df_question_train_label, df_question_train = \
            classify_question(df_question_train=df_question_train, df_question_test=df_question_test)

        scores = cross_val_score(clf, df_question_train, df_question_train_label)

        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
        print("SD:", scores.std())

        assert scores.mean() > self.classification_score 
Example #28
Source File: 07_magic.py    From sacred with MIT License
def run():
    X, y = datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2
    )
    clf = get_model()  # Parameters are injected automatically.
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test) 
Example #29
Source File: __init__.py    From deepchem with MIT License
def fit(self, dataset, **kwargs):
    """
    Fits XGBoost model to data.
    """
    X = dataset.X
    y = np.squeeze(dataset.y)
    w = np.squeeze(dataset.w)
    seed = self.model_instance.random_state
    import xgboost as xgb
    if isinstance(self.model_instance, xgb.XGBClassifier):
      xgb_metric = "auc"
      sklearn_metric = "roc_auc"
      stratify = y
    elif isinstance(self.model_instance, xgb.XGBRegressor):
      xgb_metric = "mae"
      sklearn_metric = "neg_mean_absolute_error"
      stratify = None
    best_param = self._search_param(sklearn_metric, X, y)
    # update model with best param
    self.model_instance = self.model_class(**best_param)

    # Find optimal n_estimators based on original learning_rate
    # and early_stopping_rounds
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=stratify)

    self.model_instance.fit(
        X_train,
        y_train,
        early_stopping_rounds=self.early_stopping_rounds,
        eval_metric=xgb_metric,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=self.verbose)
    # Since the test size is 20%, when retraining the model on the whole data
    # we expect n_estimators to increase by about 1/0.8 = 1.25x.
    estimated_best_round = np.round(self.model_instance.best_ntree_limit * 1.25)
    self.model_instance.n_estimators = np.int64(estimated_best_round)
    self.model_instance.fit(X, y, eval_metric=xgb_metric, verbose=self.verbose) 
Example #30
Source File: test_specified_index_splitter.py    From deepchem with MIT License
def test_split(self):
    ds = self.create_dataset()
    indexes = list(range(len(ds)))
    train, test = train_test_split(indexes)
    train, valid = train_test_split(train)

    splitter = SpecifiedIndexSplitter(train, valid, test)
    train_ds, valid_ds, test_ds = splitter.train_valid_test_split(ds)

    self.assertTrue(np.all(train_ds.X == ds.X[train]))
    self.assertTrue(np.all(valid_ds.X == ds.X[valid]))
    self.assertTrue(np.all(test_ds.X == ds.X[test]))