Python sklearn.model_selection.train_test_split() Examples
The following are 30 code examples of sklearn.model_selection.train_test_split().
The source project, file, and license are noted above each example.
You may also want to check out the other available functions and classes of the sklearn.model_selection module.
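Before the examples, here is a minimal sketch of the basic call on toy arrays (names illustrative): train_test_split shuffles the rows and returns matching train/test partitions for every array it is given.

from sklearn.model_selection import train_test_split
import numpy as np

X = np.arange(20).reshape(10, 2)  # toy feature matrix: 10 rows, 2 columns
y = np.arange(10)                 # toy labels, one per row

# Hold out 25% of the rows for testing; fixing random_state makes the
# shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

print(X_train.shape, X_test.shape)  # (7, 2) (3, 2)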
Example #1
Source File: multi_class_classification.py From edge2vec with BSD 3-Clause "New" or "Revised" License
def multi_class_classification(data_X, data_Y):
    '''
    calculate multi-class classification and return related evaluation metrics
    '''
    svc = svm.SVC(C=1, kernel='linear')
    # X_train, X_test, y_train, y_test = train_test_split(data_X, data_Y, test_size=0.4, random_state=0)
    clf = svc.fit(data_X, data_Y)  # svm
    # array = svc.coef_
    # print(array)
    predicted = cross_val_predict(clf, data_X, data_Y, cv=2)
    print("accuracy", metrics.accuracy_score(data_Y, predicted))
    print("f1 score macro", metrics.f1_score(data_Y, predicted, average='macro'))
    print("f1 score micro", metrics.f1_score(data_Y, predicted, average='micro'))
    print("precision score", metrics.precision_score(data_Y, predicted, average='macro'))
    print("recall score", metrics.recall_score(data_Y, predicted, average='macro'))
    print("hamming_loss", metrics.hamming_loss(data_Y, predicted))
    print("classification_report", metrics.classification_report(data_Y, predicted))
    # jaccard_similarity_score was removed in scikit-learn 0.23; jaccard_score is the modern equivalent
    print("jaccard_similarity_score", metrics.jaccard_similarity_score(data_Y, predicted))
    # print("log_loss", metrics.log_loss(data_Y, predicted))
    print("zero_one_loss", metrics.zero_one_loss(data_Y, predicted))
    # print("AUC&ROC", metrics.roc_auc_score(data_Y, predicted))
    # print("matthews_corrcoef", metrics.matthews_corrcoef(data_Y, predicted))
Example #2
Source File: mmbot.py From MaliciousMacroBot with MIT License
def mmb_evaluate_model(self):
    """
    Returns scores from cross validation evaluation on the malicious / benign classifier
    """
    predictive_features = self.features['predictive_features']
    self.clf_X = self.modeldata[predictive_features].values
    self.clf_y = np.array(self.modeldata['label'])

    X_train, X_test, y_train, y_test = train_test_split(self.clf_X, self.clf_y,
                                                        test_size=0.2, random_state=0)
    lb = LabelBinarizer()
    y_train = np.array([number[0] for number in lb.fit_transform(y_train)])

    eval_cls = RandomForestClassifier(n_estimators=100, max_features=.2)
    eval_cls.fit(X_train, y_train)

    recall = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='recall')
    precision = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='precision')
    accuracy = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='accuracy')
    f1_score = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='f1_macro')

    return {'accuracy': accuracy, 'f1': f1_score, 'precision': precision, 'recall': recall}
Example #3
Source File: test_shap.py From AIX360 with Apache License 2.0
def test_ShapLinearExplainer(self):
    corpus, y = shap.datasets.imdb()
    corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y,
                                                                  test_size=0.2, random_state=7)

    vectorizer = TfidfVectorizer(min_df=10)
    X_train = vectorizer.fit_transform(corpus_train)
    X_test = vectorizer.transform(corpus_test)

    model = sklearn.linear_model.LogisticRegression(penalty="l1", C=0.1, solver='liblinear')
    model.fit(X_train, y_train)

    shapexplainer = LinearExplainer(model, X_train, feature_dependence="independent")
    shap_values = shapexplainer.explain_instance(X_test)
    print("Invoked Shap LinearExplainer")

# comment this test as travis runs out of resources
Example #4
Source File: ml_elm.py From Python-ELM with MIT License
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import train_test_split

    db_name = 'diabetes'
    data_set = fetch_mldata(db_name)

    data_set.data = preprocessing.normalize(data_set.data)
    tmp = data_set.target
    tmpL = [1 if i == "tested_positive" else -1 for i in tmp]
    data_set.target = tmpL

    X_train, X_test, y_train, y_test = train_test_split(
        data_set.data, data_set.target, test_size=0.4)

    mlelm = MLELM(hidden_units=(10, 30, 200)).fit(X_train, y_train)
    elm = ELM(200).fit(X_train, y_train)
    print("MLELM Accuracy %0.3f " % mlelm.score(X_test, y_test))
    print("ELM Accuracy %0.3f " % elm.score(X_test, y_test))
Example #5
Source File: automl.py From Kaggler with MIT License
def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
    X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=test_size,
                                                  shuffle=self.shuffle)

    def objective(hyperparams):
        model = XGBModel(n_estimators=self.n_est, **self.params, **hyperparams)
        model.fit(X=X_trn, y=y_trn,
                  eval_set=[(X_val, y_val)],
                  eval_metric=self.metric,
                  early_stopping_rounds=self.n_stop,
                  verbose=False)
        score = model.evals_result()['validation_0'][self.metric][model.best_iteration] * self.loss_sign
        return {'loss': score, 'status': STATUS_OK, 'model': model}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=self.space, trials=trials,
                         algo=tpe.suggest, max_evals=n_eval, verbose=1,
                         rstate=self.random_state)
    hyperparams = space_eval(self.space, best)
    return hyperparams, trials
Example #6
Source File: automl.py From Kaggler with MIT License
def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
    X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=test_size,
                                                  shuffle=self.shuffle)
    train_data = lgb.Dataset(X_trn, label=y_trn)
    valid_data = lgb.Dataset(X_val, label=y_val)

    def objective(hyperparams):
        model = lgb.train({**self.params, **hyperparams}, train_data, self.n_est,
                          valid_data, early_stopping_rounds=self.n_stop, verbose_eval=0)
        score = model.best_score["valid_0"][self.metric] * self.loss_sign
        return {'loss': score, 'status': STATUS_OK, 'model': model}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=self.space, trials=trials,
                         algo=tpe.suggest, max_evals=n_eval, verbose=1,
                         rstate=self.random_state)
    hyperparams = space_eval(self.space, best)
    return hyperparams, trials
Example #7
Source File: label_digits.py From libact with BSD 2-Clause "Simplified" License
def split_train_test(n_classes):
    from sklearn.datasets import load_digits

    n_labeled = 5
    digits = load_digits(n_class=n_classes)  # consider binary case
    X = digits.data
    y = digits.target
    print(np.shape(X))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    while len(np.unique(y_train[:n_labeled])) < n_classes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33)

    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)

    return trn_ds, tst_ds, digits
Example #8
Source File: pipeline.py From qb with MIT License
def run(self):
    with open(QANTA_TRAIN_DATASET_PATH) as f:
        all_guess_train = [q for q in json.load(f)['questions'] if q['fold'] == GUESSER_TRAIN_FOLD]

    guess_train, guess_val = train_test_split(all_guess_train, random_state=42, train_size=.9)

    with open(QANTA_DEV_DATASET_PATH) as f:
        guess_dev = [q for q in json.load(f)['questions'] if q['fold'] == GUESSER_DEV_FOLD]

    with open(QANTA_TORCH_TRAIN_LOCAL_PATH, 'w') as f:
        json.dump(format_qanta_json(guess_train, DS_VERSION), f)

    with open(QANTA_TORCH_VAL_LOCAL_PATH, 'w') as f:
        json.dump(format_qanta_json(guess_val, DS_VERSION), f)

    with open(QANTA_TORCH_DEV_LOCAL_PATH, 'w') as f:
        json.dump(format_qanta_json(guess_dev, DS_VERSION), f)
Example #9
Source File: test_GaussianNB.py From differential-privacy-library with MIT License
def test_with_iris(self):
    global_seed(12345)
    from sklearn import datasets
    dataset = datasets.load_iris()
    x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target,
                                                        test_size=.2)

    bounds = ([4.3, 2.0, 1.0, 0.1], [7.9, 4.4, 6.9, 2.5])
    clf = GaussianNB(epsilon=5.0, bounds=bounds)
    clf.fit(x_train, y_train)

    accuracy = clf.score(x_test, y_test)
    counts = clf.class_count_.copy()
    self.assertGreater(accuracy, 0.5)

    clf.partial_fit(x_train, y_train)
    new_counts = clf.class_count_
    self.assertEqual(np.sum(new_counts), np.sum(counts) * 2)
Example #10
Source File: utils.py From cloudml-samples with Apache License 2.0
def data_train_test_split(data_df):
    """Split the DataFrame into two subsets for training and testing.

    Args:
        data_df: (pandas.DataFrame) DataFrame to perform the split on

    Returns:
        A Tuple of (pandas.DataFrame, pandas.Series,
                    pandas.DataFrame, pandas.Series)
    """
    label_column = metadata.LABEL
    # Only use metadata.FEATURE_COLUMNS + metadata.LABEL
    columns_to_use = metadata.FEATURE_COLUMNS + [label_column]

    train, val = model_selection.train_test_split(data_df[columns_to_use])
    x_train, y_train = _feature_label_split(train, label_column)
    x_val, y_val = _feature_label_split(val, label_column)
    return x_train, y_train, x_val, y_val
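A note on the call above: when neither test_size nor train_size is passed, scikit-learn defaults to holding out 25% of the rows. A quick check of that default (toy frame, names illustrative):

import pandas as pd
from sklearn import model_selection

df = pd.DataFrame({"a": range(100)})
train, val = model_selection.train_test_split(df)  # no sizes given
print(len(train), len(val))  # 75 25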
Example #11
Source File: common_utils.py From interpret-text with MIT License
def create_simple_titanic_data():
    titanic_url = (
        "https://raw.githubusercontent.com/amueller/"
        "scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv"
    )
    data = read_csv(titanic_url)
    # fill missing values
    data = data.fillna(method="ffill")
    data = data.fillna(method="bfill")
    numeric_features = ["age", "fare"]
    categorical_features = ["embarked", "sex", "pclass"]

    y = data["survived"].values
    X = data[categorical_features + numeric_features]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test, numeric_features, categorical_features
Example #12
Source File: test_LinearRegression.py From differential-privacy-library with MIT License
def test_same_results(self):
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn import linear_model

    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target,
                                                        test_size=0.2)

    clf = LinearRegression(data_norm=12, epsilon=float("inf"),
                           bounds_X=([4.3, 2.0, 1.0, 0.1], [7.9, 4.4, 6.9, 2.5]),
                           bounds_y=(0, 2))
    clf.fit(X_train, y_train)

    predict1 = clf.predict(X_test)

    clf = linear_model.LinearRegression(normalize=False)
    clf.fit(X_train, y_train)
    predict2 = clf.predict(X_test)

    self.assertTrue(np.allclose(predict1, predict2))
Example #13
Source File: graph_eval.py From nodevectors with MIT License
def print_labeled_tests(w, y, test_size=0.2, seed=42):
    """
    Clustering and label prediction tests
    """
    X_train, X_test, y_train, y_test = train_test_split(
        w, y, test_size=test_size, random_state=seed)

    # Print Label Prediction Tests
    res = LabelPrediction(w, y, test_size=test_size, seed=seed)

    # Can only cluster on single-label (not multioutput)
    if len(y.shape) < 2:
        n_clusters = np.unique(y).size
        umpagglo = cluster.AgglomerativeClustering(
            n_clusters=n_clusters,
            affinity='cosine',
            linkage='average'
        ).fit(w).labels_
        x = evalClusteringOnLabels(umpagglo, y, verbose=True)
        res = {**res, **x}
    return res
Example #14
Source File: test_LinearRegression.py From differential-privacy-library with MIT License
def test_different_results(self):
    from sklearn import datasets
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target,
                                                        test_size=0.2)

    clf = LinearRegression(data_norm=12,
                           bounds_X=([4.3, 2.0, 1.1, 0.1], [7.9, 4.4, 6.9, 2.5]),
                           bounds_y=(0, 2))
    clf.fit(X_train, y_train)
    predict1 = clf.predict(X_test)

    clf = LinearRegression(data_norm=12,
                           bounds_X=([4.3, 2.0, 1.1, 0.1], [7.9, 4.4, 6.9, 2.5]),
                           bounds_y=(0, 2))
    clf.fit(X_train, y_train)
    predict2 = clf.predict(X_test)

    clf = linear_model.LinearRegression()
    clf.fit(X_train, y_train)
    predict3 = clf.predict(X_test)

    self.assertFalse(np.all(predict1 == predict2))
    self.assertFalse(np.all(predict3 == predict1) and np.all(predict3 == predict2))
Example #15
Source File: common_utils.py From interpret-text with MIT License
def create_cancer_data():
    # Import cancer dataset
    cancer = (
        retrieve_dataset("breast-cancer.train.csv", na_values="?")
        .interpolate()
        .astype("int64")
    )
    cancer_target = cancer.iloc[:, 0]
    cancer_data = cancer.iloc[:, 1:]
    feature_names = cancer_data.columns.values
    target_names = ["no_cancer", "cancer"]

    # Split data into train and test
    x_train, x_test, y_train, y_validation = train_test_split(
        cancer_data, cancer_target, test_size=0.2, random_state=0
    )
    return x_train, x_test, y_train, y_validation, feature_names, target_names
Example #16
Source File: test_LogisticRegression.py From differential-privacy-library with MIT License
def test_same_results(self):
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn import linear_model

    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target,
                                                        test_size=0.2)

    clf = LogisticRegression(data_norm=12, epsilon=float("inf"))
    clf.fit(X_train, y_train)
    predict1 = clf.predict(X_test)

    clf = linear_model.LogisticRegression(solver="lbfgs", multi_class="ovr")
    clf.fit(X_train, y_train)
    predict2 = clf.predict(X_test)

    self.assertTrue(np.all(predict1 == predict2))
Example #17
Source File: main_data_engine.py From TripletLossFace with MIT License
def create_tensorflow_dataset_object(self, paths, labels, test_rate: float = 0.1,
                                     test_data: tuple = (None, None), supportive: bool = False):
    print("Creating TensorFlow dataset object...")

    if type(test_data) != tuple:
        print("\"test_data\" must be a tuple for 'create_tensorflow_dataset_object'; test data will be taken from the real data "
              + f"with a rate of {test_rate}. You have 5 seconds to stop the process and cancel running. Use Ctrl+C to do that.")
        time.sleep(5)

    paths_train, paths_test, labels_train, labels_test = train_test_split(
        paths, labels, test_size=test_rate, random_state=42)
    print("Dataset split by the system; please make sure this is what you want.")

    dataset_train = tf.data.Dataset.from_tensor_slices((paths_train, labels_train)).shuffle(len(labels_train))
    dataset_test = tf.data.Dataset.from_tensor_slices((paths_test, labels_test)).shuffle(len(labels_test))

    print("TensorFlow dataset object created!")

    if not supportive:
        self.dataset_train = dataset_train
        self.dataset_test = dataset_test

    return dataset_train, dataset_test
Example #18
Source File: main_data_engine.py From TripletLossFace with MIT License
def create_tensorflow_dataset_object(self, paths, labels, test_rate: float = 0.1,
                                     test_data: tuple = (None, None), supportive: bool = False):
    print("Creating TensorFlow dataset object...")

    if type(test_data) != tuple:
        printl("\"test_data\" must be a tuple for 'create_tensorflow_dataset_object'; test data will be taken from the real data "
               + f"with a rate of {test_rate}. You have 5 seconds to stop the process and cancel running. Use Ctrl+C to do that.")
        time.sleep(5)

    paths_train, paths_test, labels_train, labels_test = train_test_split(
        paths, labels, test_size=test_rate, random_state=42)
    print("Dataset split by the system; please make sure this is what you want.")

    dataset_train = tf.data.Dataset.from_tensor_slices((paths_train, labels_train)).shuffle(len(labels_train))
    dataset_test = tf.data.Dataset.from_tensor_slices((paths_test, labels_test)).shuffle(len(labels_test))

    print("TensorFlow dataset object created!")

    if not supportive:
        self.dataset_train = dataset_train
        self.dataset_test = dataset_test

    return dataset_train, dataset_test
Example #19
Source File: audio_util.py From Tensorflow-Audio-Classification with Apache License 2.0
def train_test_val_split(X, Y, split=(0.2, 0.1), shuffle=True):
    """Split dataset into train/val/test subsets by 70:20:10 (default).

    Args:
        X: List of data.
        Y: List of labels corresponding to data.
        split: Tuple of split ratios in `test:val` order.
        shuffle: Bool of shuffle or not.
    Returns:
        Three datasets in `train:test:val` order.
    """
    from sklearn.model_selection import train_test_split
    assert len(X) == len(Y), 'The length of X and Y must be consistent.'

    X_train, X_test_val, Y_train, Y_test_val = train_test_split(
        X, Y, test_size=(split[0] + split[1]), shuffle=shuffle)
    # The second split's test_size is relative to the held-out subset, so
    # divide by the subset's share to recover the intended val ratio.
    X_test, X_val, Y_test, Y_val = train_test_split(
        X_test_val, Y_test_val, test_size=split[1] / (split[0] + split[1]), shuffle=False)

    return (X_train, Y_train), (X_test, Y_test), (X_val, Y_val)
Example #20
Source File: run.py From fake-news-detection with MIT License
def pipeline(args):
    '''
    Runs the model loop.
    '''
    df = pd.read_csv(args.filename)
    df.loc[:, args.x_label] = df[args.x_label].fillna("None")
    if args.dedupe:
        df = df.drop_duplicates(subset='content')
    if args.reduce:
        df = restrict_sources(df)

    X = df[args.x_label]
    y = df[args.y_label]
    parser = spacy.load('en')

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    loop = ModelLoop(X_train, X_test, y_train, y_test, args.models,
                     args.iterations, args.output_dir,
                     thresholds=args.thresholds, ks=args.ks,
                     setting=args.features[0])
    loop.run()
Example #21
Source File: learning_curve.py From dota2-predictor with MIT License
def plot_learning_curve(x_train, y_train, subsets=20, mmr=None, cv=5, tool='matplotlib'):
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)

    subset_sizes = np.exp(np.linspace(3, np.log(len(y_train)), subsets)).astype(int)
    results_list = [[], []]

    for subset_size in subset_sizes:
        logger.info('Performing cross validation on subset_size %d', subset_size)
        _, _, cv_score, roc_auc, _ = evaluate([x_train[:subset_size], y_train[:subset_size]],
                                              [x_test, y_test], cv=cv)
        results_list[0].append(1 - cv_score)
        results_list[1].append(1 - roc_auc)

    if tool == 'matplotlib':
        _plot_matplotlib(subset_sizes, results_list, mmr)
    else:
        _plot_plotly(subset_sizes, results_list, mmr)
Example #22
Source File: models.py From xcessiv with Apache License 2.0
def return_train_dataset(self):
    """Returns train data set

    Returns:
        X (numpy.ndarray): Features
        y (numpy.ndarray): Labels
    """
    X, y = self.return_main_dataset()

    if self.test_dataset['method'] == 'split_from_main':
        X, X_test, y, y_test = train_test_split(
            X,
            y,
            test_size=self.test_dataset['split_ratio'],
            random_state=self.test_dataset['split_seed'],
            stratify=y
        )

    return X, y
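A note on the stratify=y argument above: it keeps the class proportions of y roughly equal across both partitions, which matters for imbalanced labels. A minimal sketch (toy arrays, names illustrative):

import numpy as np
from sklearn.model_selection import train_test_split

y = np.array([0] * 80 + [1] * 20)      # imbalanced toy labels
X = np.arange(len(y)).reshape(-1, 1)   # dummy features

# With stratify=y both partitions keep the 80:20 class ratio;
# without it, the ratio in a small split can drift by chance.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=0)
print(np.bincount(y_tr), np.bincount(y_te))  # [60 15] [20  5]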
Example #23
Source File: make_dataset.py From snape with Apache License 2.0
def write_dataset(df, file_name, out_path="." + os.path.sep):
    """
    Writes generated dataset to file

    :param df: dataframe to write
    :param file_name: beginning of filename
    :param out_path: the path to write the dataset
    :return: None
    """
    # todo: Mike, do we want to take a param for overwriting existing files?
    df_train, df_testkey = train_test_split(df, test_size=.2)

    df_train.to_csv(out_path + file_name + "_train.csv", index=False)
    df_test = df_testkey.drop(['y'], axis=1)
    df_test.to_csv(out_path + file_name + "_test.csv", index=False)
    df_testkey.to_csv(out_path + file_name + "_testkey.csv", index=False)
Example #24
Source File: sgcn.py From SGCN with GNU General Public License v3.0
def setup_dataset(self):
    """
    Creating train and test split.
    """
    self.positive_edges, self.test_positive_edges = train_test_split(
        self.edges["positive_edges"], test_size=self.args.test_size)
    self.negative_edges, self.test_negative_edges = train_test_split(
        self.edges["negative_edges"], test_size=self.args.test_size)
    self.ecount = len(self.positive_edges + self.negative_edges)

    self.X = setup_features(self.args, self.positive_edges,
                            self.negative_edges, self.edges["ncount"])
    self.positive_edges = torch.from_numpy(
        np.array(self.positive_edges, dtype=np.int64).T).type(torch.long).to(self.device)
    self.negative_edges = torch.from_numpy(
        np.array(self.negative_edges, dtype=np.int64).T).type(torch.long).to(self.device)
    self.y = np.array([0 if i < int(self.ecount / 2) else 1
                       for i in range(self.ecount)] + [2] * (self.ecount * 2))
    self.y = torch.from_numpy(self.y).type(torch.LongTensor).to(self.device)
    self.X = torch.from_numpy(self.X).float().to(self.device)
Example #25
Source File: DataModule.py From sgd-influence with MIT License
def fetch(self, n_tr, n_val, n_test, seed=0):
    x, y = self.load()

    # split data
    x_tr, x_val, y_tr, y_val = train_test_split(
        x, y, train_size=n_tr, test_size=n_val + n_test, random_state=seed)
    x_val, x_test, y_val, y_test = train_test_split(
        x_val, y_val, train_size=n_val, test_size=n_test, random_state=seed + 1)

    # process x
    if self.normalize:
        scaler = StandardScaler()
        scaler.fit(x_tr)
        x_tr = scaler.transform(x_tr)
        x_val = scaler.transform(x_val)
        x_test = scaler.transform(x_test)
    if self.append_one:
        x_tr = np.c_[x_tr, np.ones(n_tr)]
        x_val = np.c_[x_val, np.ones(n_val)]
        x_test = np.c_[x_test, np.ones(n_test)]

    return (x_tr, y_tr), (x_val, y_val), (x_test, y_test)
Example #26
Source File: datasets.py From kaggle-carvana-2017 with MIT License
def bootstrapped_split(car_ids, seed=args.seed):
    """
    # Arguments
        metadata: metadata.csv provided by Carvana (should include `train` column).

    # Returns
        A tuple (train_ids, test_ids)
    """
    all_ids = pd.Series(car_ids)
    train_ids, valid_ids = train_test_split(car_ids, test_size=args.test_size_float,
                                            random_state=seed)

    np.random.seed(seed)
    bootstrapped_idx = np.random.random_integers(0, len(train_ids))
    bootstrapped_train_ids = train_ids[bootstrapped_idx]

    return generate_filenames(bootstrapped_train_ids.values), generate_filenames(valid_ids)
Example #27
Source File: test_classify_question.py From adam_qas with GNU General Public License v3.0
def test_classify_question(self):
    training_data_path = os.path.join(CORPUS_DIR, QUESTION_CLASSIFICATION_TRAINING_DATA)
    df_question = pandas.read_csv(training_data_path, sep='|', header=0)

    df_question_train, df_question_test = train_test_split(df_question, test_size=0.2,
                                                           random_state=42)
    predicted_class, clf, df_question_train_label, df_question_train = \
        classify_question(df_question_train=df_question_train,
                          df_question_test=df_question_test)

    scores = cross_val_score(clf, df_question_train, df_question_train_label)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("SD:", scores.std())

    assert scores.mean() > self.classification_score
Example #28
Source File: 07_magic.py From sacred with MIT License
def run():
    X, y = datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2
    )
    clf = get_model()  # Parameters are injected automatically.
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)
Example #29
Source File: __init__.py From deepchem with MIT License
def fit(self, dataset, **kwargs):
    """
    Fits XGBoost model to data.
    """
    X = dataset.X
    y = np.squeeze(dataset.y)
    w = np.squeeze(dataset.w)
    seed = self.model_instance.random_state
    import xgboost as xgb
    if isinstance(self.model_instance, xgb.XGBClassifier):
        xgb_metric = "auc"
        sklearn_metric = "roc_auc"
        stratify = y
    elif isinstance(self.model_instance, xgb.XGBRegressor):
        xgb_metric = "mae"
        sklearn_metric = "neg_mean_absolute_error"
        stratify = None
    best_param = self._search_param(sklearn_metric, X, y)
    # update model with best param
    self.model_instance = self.model_class(**best_param)

    # Find optimal n_estimators based on original learning_rate
    # and early_stopping_rounds
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=stratify)
    self.model_instance.fit(
        X_train, y_train,
        early_stopping_rounds=self.early_stopping_rounds,
        eval_metric=xgb_metric,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=self.verbose)
    # Since the test size is 20%, expect n_estimators to grow by roughly
    # 1 / 0.8 = 1.25x when retraining on the whole dataset.
    estimated_best_round = np.round(self.model_instance.best_ntree_limit * 1.25)
    self.model_instance.n_estimators = np.int64(estimated_best_round)
    self.model_instance.fit(X, y, eval_metric=xgb_metric, verbose=self.verbose)
Example #30
Source File: test_specified_index_splitter.py From deepchem with MIT License
def test_split(self):
    ds = self.create_dataset()
    indexes = list(range(len(ds)))
    train, test = train_test_split(indexes)
    train, valid = train_test_split(train)

    splitter = SpecifiedIndexSplitter(train, valid, test)
    train_ds, valid_ds, test_ds = splitter.train_valid_test_split(ds)

    self.assertTrue(np.all(train_ds.X == ds.X[train]))
    self.assertTrue(np.all(valid_ds.X == ds.X[valid]))
    self.assertTrue(np.all(test_ds.X == ds.X[test]))
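The pattern in this last example, splitting a list of row indices instead of the data itself, is useful whenever several aligned structures must be partitioned consistently. A minimal sketch (array names illustrative):

import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(10, 3)        # primary data
weights = np.arange(10) * 0.1    # a second array that must stay row-aligned with X

indices = np.arange(10)
train_idx, test_idx = train_test_split(indices, test_size=0.3, random_state=0)

# Indexing every array with the same index lists keeps the rows aligned.
X_train, w_train = X[train_idx], weights[train_idx]
X_test, w_test = X[test_idx], weights[test_idx]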