Python sklearn.model_selection.TimeSeriesSplit() Examples
The following are 18 code examples of sklearn.model_selection.TimeSeriesSplit().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.model_selection, or try the search function.
Example #1
Source File: listing_9_3_backtest.py From fight-churn with MIT License | 8 votes |
def backtest(data_set_path, n_test_split):
    """Cross-validate an L1 logistic regression on time-ordered folds and save the scores.

    The data is loaded with ``prepare_data(data_set_path, as_retention=False)``
    and scored with two metrics ('lift' and 'AUC') over a ``TimeSeriesSplit``
    of ``n_test_split`` folds. The ``cv_results_`` table is written next to
    the input file, with ``.csv`` replaced by ``_backtest.csv``.

    Args:
        data_set_path: Path of the data set; also used to derive the output path.
        n_test_split: Number of splits handed to ``TimeSeriesSplit``.
    """
    features, target = prepare_data(data_set_path, as_retention=False)

    search = GridSearchCV(
        estimator=LogisticRegression(penalty='l1', solver='liblinear',
                                     fit_intercept=True),
        # A single candidate: the grid search is used only for its fold scoring.
        param_grid={'C': [1]},
        scoring={
            'lift': make_scorer(calc_lift, needs_proba=True),
            'AUC': 'roc_auc',
        },
        cv=TimeSeriesSplit(n_splits=n_test_split),
        refit='AUC',
        return_train_score=False,
        verbose=1,
    )
    search.fit(features, target)

    output_path = data_set_path.replace('.csv', '_backtest.csv')
    pd.DataFrame(search.cv_results_).to_csv(output_path, index=False)
    print('Saved test scores to ' + output_path)
Example #2
Source File: test_anomaly_detectors.py From gordo with GNU Affero General Public License v3.0 | 6 votes |
def test_diff_detector_cross_validate(return_estimator: bool):
    """
    The result of DiffBasedAnomalyDetector.cross_validate must expose the same
    keys as sklearn.model_selection.cross_validate called with
    ``return_estimator=True``, whatever ``return_estimator`` value the caller
    passes to the detector (the detector needs the intermediate models to
    calculate its thresholds, so it always behaves as if it were True).
    """
    # Draw X before y so the global RNG is consumed in the same order as before.
    features = np.random.random((100, 10))
    targets = np.random.random((100, 1))

    detector = DiffBasedAnomalyDetector(base_estimator=LinearRegression())
    splitter = TimeSeriesSplit(n_splits=3)

    detector_results = detector.cross_validate(
        X=features, y=targets, cv=splitter, return_estimator=return_estimator
    )
    sklearn_results = cross_validate(
        detector, X=features, y=targets, cv=splitter, return_estimator=True
    )

    assert detector_results.keys() == sklearn_results.keys()
Example #3
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_2d_y():
    # Smoke test: every splitter must accept 1-d and column-vector targets;
    # for multi-label targets it must either work or fail with the
    # "supported target types" ValueError.
    n_samples = 30
    rng = np.random.RandomState(1)
    # The draws below stay in this order so each array gets the same values.
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    # The expected message prefix does not depend on the splitter.
    expected_msg = "Supported target types are: {}. Got 'multilabel".format(
        ('binary', 'multiclass'))

    splitters = [
        LeaveOneOut(),
        LeavePOut(p=2),
        KFold(),
        StratifiedKFold(),
        RepeatedKFold(),
        RepeatedStratifiedKFold(),
        ShuffleSplit(),
        StratifiedShuffleSplit(test_size=.5),
        GroupShuffleSplit(),
        LeaveOneGroupOut(),
        LeavePGroupsOut(n_groups=2),
        GroupKFold(),
        TimeSeriesSplit(),
        PredefinedSplit(test_fold=groups),
    ]
    for splitter in splitters:
        for target in (y, y_2d):
            # list() forces the lazy split iterator to run to completion.
            list(splitter.split(X, target, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as err:
            assert expected_msg in str(err)
Example #4
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_2d_y():
    # Smoke test for 2d y and multi-label targets: plain and column-vector
    # targets must split without error, while multi-label targets may only
    # fail with the "supported target types" ValueError.
    n_samples = 30
    rng = np.random.RandomState(1)
    # Keep the draw order fixed; it determines the contents of each array.
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    allowed_target_types = ('binary', 'multiclass')
    multilabel_msg = "Supported target types are: {}. Got 'multilabel".format(
        allowed_target_types)

    splitters = [
        LeaveOneOut(), LeavePOut(p=2),
        KFold(), StratifiedKFold(),
        RepeatedKFold(), RepeatedStratifiedKFold(),
        ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
        GroupShuffleSplit(), LeaveOneGroupOut(),
        LeavePGroupsOut(n_groups=2), GroupKFold(),
        TimeSeriesSplit(), PredefinedSplit(test_fold=groups),
    ]
    for cv in splitters:
        # Exhaust each split iterator so the splitter does its full work.
        list(cv.split(X, y, groups))
        list(cv.split(X, y_2d, groups))
        try:
            list(cv.split(X, y_multilabel, groups))
        except ValueError as exc:
            assert multilabel_msg in str(exc)
Example #5
Source File: listing_9_6_crossvalidate_xgb.py From fight-churn with MIT License | 6 votes |
def crossvalidate_xgb(data_set_path, n_test_split):
    """Grid-search an XGBoost classifier on time-ordered folds and save the results.

    Steps, in order:
      1. load the data with ``prepare_data(data_set_path, ext='', as_retention=False)``;
      2. grid-search ``XGBClassifier`` over four hyper-parameters, scoring
         'lift' and 'AUC' and refitting on 'AUC';
      3. write the CV table, sorted by ``mean_test_AUC`` descending, to ``*_crossval_xgb.csv``;
      4. pickle the refitted best estimator to ``*_xgb_model.pkl``;
      5. write its predicted probabilities for the full data to
         ``*_xgb_predictions.csv`` and pass them to ``forecast_histogram``.

    Args:
        data_set_path: Path of the data set; every output path is derived from
            it by replacing ``.csv``.
        n_test_split: Number of splits handed to ``TimeSeriesSplit``.
    """
    features, target = prepare_data(data_set_path, ext='', as_retention=False)

    search = GridSearchCV(
        estimator=xgb.XGBClassifier(objective='binary:logistic'),
        param_grid={
            'max_depth': [1, 2, 4, 6],
            'learning_rate': [0.1, 0.2, 0.3, 0.4],
            'n_estimators': [20, 40, 80, 120],
            'min_child_weight': [3, 6, 9, 12],
        },
        scoring={
            'lift': make_scorer(calc_lift, needs_proba=True),
            'AUC': 'roc_auc',
        },
        cv=TimeSeriesSplit(n_splits=n_test_split),
        refit='AUC',
        n_jobs=-1,
        return_train_score=False,
        verbose=1,
    )
    # The estimator is fitted on the raw array, not the DataFrame.
    search.fit(features.values, target)

    ranked_results = pd.DataFrame(search.cv_results_).sort_values(
        'mean_test_AUC', ascending=False)
    scores_path = data_set_path.replace('.csv', '_crossval_xgb.csv')
    ranked_results.to_csv(scores_path, index=False)
    print('Saved test scores to ' + scores_path)

    model_path = data_set_path.replace('.csv', '_xgb_model.pkl')
    with open(model_path, 'wb') as fid:
        pickle.dump(search.best_estimator_, fid)
    print('Saved model pickle to ' + model_path)

    probabilities = search.best_estimator_.predict_proba(features.values)
    predict_df = pd.DataFrame(probabilities, index=features.index,
                              columns=['retain_prob', 'churn_prob'])
    forecast_path = data_set_path.replace('.csv', '_xgb_predictions.csv')
    print('Saving results to %s' % forecast_path)
    predict_df.to_csv(forecast_path, header=True)

    forecast_histogram(data_set_path, predict_df, ext='xgb')
Example #6
Source File: churn_calc.py From fight-churn with MIT License | 6 votes |
def prepare_xy(self, groups=True):
    """Build the feature matrix and target, ordered by observation date.

    Args:
        groups: When truthy, ``apply_behavior_grouping()`` is run and the
            features come from ``churn_data_reduced`` / ``grouped_columns``.
            Otherwise ``normalize_skewscale()`` is run and the features come
            from ``data_scores`` / ``metric_columns``.

    Returns:
        Tuple ``(X, y)``: the selected feature columns and the ``is_churn``
        column, both sorted by observation date.
    """
    if not groups:
        self.normalize_skewscale()
        frame = pd.DataFrame(self.data_scores)
        feature_columns = self.metric_columns
    else:
        self.apply_behavior_grouping()
        frame = pd.DataFrame(self.churn_data_reduced)
        feature_columns = self.grouped_columns

    # TimeSeriesSplit cuts folds by row position, so the rows must be in
    # chronological order before they are handed to it.
    frame['temp_obs_date'] = self.observe_dates.values
    frame.sort_values('temp_obs_date', inplace=True)

    return frame[feature_columns], frame['is_churn']
Example #7
Source File: churn_calc.py From fight-churn with MIT License | 6 votes |
def crossvalidate_churn_model(self, model_code, groups=True):
    """Grid-search the model named by ``model_code`` on time-ordered folds.

    Args:
        model_code: Key passed to ``cv_params`` and ``model_instance``; also
            the prefix of the saved file name (``<model_code>_CV``).
        groups: Forwarded to ``prepare_xy`` and ``grouping_correlation_subdir``.

    Returns:
        DataFrame of ``cv_results_``; sorted by ``mean_test_AUC`` descending
        when ``len(params) > 1``. The same table is written to the path
        returned by ``self.save_path``.
    """
    # Same call order as before, in case these helpers mutate state on self.
    X, y = self.prepare_xy(groups)
    params = self.cv_params(model_code)
    model = self.model_instance(model_code)

    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring={
            'lift_scorer': make_scorer(top_decile_lift, needs_proba=True),
            'AUC': 'roc_auc',
        },
        cv=TimeSeriesSplit(n_splits=3),
        refit='AUC',
        return_train_score=True,
        n_jobs=8,
        verbose=5,
    )
    search.fit(X, y)

    result_df = pd.DataFrame(search.cv_results_)
    if len(params) > 1:
        result_df = result_df.sort_values('mean_test_AUC', ascending=False)

    save_path = self.save_path(
        model_code + '_CV',
        subdir=self.grouping_correlation_subdir(groups),
    )
    result_df.to_csv(save_path)
    print('Saved result to ' + save_path)
    return result_df
Example #8
Source File: tssplitblock.py From Persimmon with MIT License | 5 votes |
def function(self):
    """Store a ``TimeSeriesSplit`` built with default arguments in ``self.out_1.val``."""
    splitter = TimeSeriesSplit()
    self.out_1.val = splitter
Example #9
Source File: test_model.py From gordo with GNU Affero General Public License v3.0 | 5 votes |
def test_keras_autoencoder_crossval(model, kind):
    """
    Cross-validating a pipeline that wraps the requested model class must
    return a numpy array of scores.
    """
    model_cls = pydoc.locate(f"gordo.machine.model.models.{model}")
    pipeline = Pipeline([("model", model_cls(kind=kind))])

    # The target is a copy of the input.
    X = np.random.random(size=(15, 2))
    y = X.copy()

    splitter = TimeSeriesSplit(n_splits=2, max_train_size=2)
    scores = cross_val_score(pipeline, X, y, cv=splitter)

    assert isinstance(scores, np.ndarray)
    logger.info(f"Mean score: {scores.mean():.4f} - Std score: {scores.std():.4f}")
Example #10
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_time_series_max_train_size():
    """Check TimeSeriesSplit's ``max_train_size`` against the unbounded splits.

    Three situations are covered for a 6-sample input split three times:
    a cap of 3, a cap of 2 (smaller than a fold's training set) and a cap
    of 5 (not smaller than any fold's training set).
    """
    X = np.zeros((6, 1))

    # ``split`` hands back a one-shot iterator. Store the reference splits in
    # a list so that every check below compares against all three folds;
    # sharing the raw iterator would leave nothing for the second and third
    # checks to compare against.
    splits = list(TimeSeriesSplit(n_splits=3).split(X))

    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=3)

    # Test for the case where the size of a fold is greater than max_train_size
    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=2)

    # Test for the case where the size of each fold is less than max_train_size.
    # The bound passed to the helper must be the one the splitter was built
    # with (5); the training sets here have 3, 4 and 5 samples.
    # NOTE(review): assumes the helper asserts len(train) <= max_train_size - confirm.
    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=5)
Example #11
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_time_series_cv():
    """Check TimeSeriesSplit's fold contents, ordering and split count on toy data."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    # Should fail if there are more folds than samples
    assert_raises_regexp(
        ValueError,
        "Cannot have number of folds.*greater",
        next,
        TimeSeriesSplit(n_splits=7).split(X),
    )

    tscv = TimeSeriesSplit(2)

    # Manually check that Time Series CV preserves the data ordering on toy
    # datasets: first with six samples, then with all seven.
    scenarios = [
        (tscv.split(X[:-1]),
         [([0, 1], [2, 3]),
          ([0, 1, 2, 3], [4, 5])]),
        (TimeSeriesSplit(2).split(X),
         [([0, 1, 2], [3, 4]),
          ([0, 1, 2, 3, 4], [5, 6])]),
    ]
    for fold_iter, expected_folds in scenarios:
        for expected_train, expected_test in expected_folds:
            train, test = next(fold_iter)
            assert_array_equal(train, expected_train)
            assert_array_equal(test, expected_test)

    # Check get_n_splits returns the correct number of splits
    n_splits_actual = sum(1 for _ in TimeSeriesSplit(2).split(X))
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)
Example #12
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper_abbr(self):
    """The ``ms`` accessor of a ModelFrame must expose the very same objects as ``ms``."""
    df = pdml.ModelFrame([])

    # Splitter Classes
    # StratifiedShuffleSplit is not checked (its assertion is commented out
    # in the original test).
    splitter_classes = (
        'KFold',
        'GroupKFold',
        'StratifiedKFold',
        'LeaveOneGroupOut',
        'LeavePGroupsOut',
        'LeaveOneOut',
        'LeavePOut',
        'ShuffleSplit',
        'GroupShuffleSplit',
        'PredefinedSplit',
        'TimeSeriesSplit',
    )

    # Splitter Functions

    # Hyper-parameter optimizers
    optimizers = (
        'GridSearchCV',
        'RandomizedSearchCV',
        'ParameterGrid',
        'ParameterSampler',
    )

    for attr in splitter_classes + optimizers:
        self.assertIs(getattr(df.ms, attr), getattr(ms, attr))

    # Model validation
Example #13
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper(self):
    """The ``model_selection`` accessor of a ModelFrame must expose the very same objects as ``ms``."""
    df = pdml.ModelFrame([])

    # Splitter Classes
    # StratifiedShuffleSplit is not checked (its assertion is commented out
    # in the original test).
    splitter_classes = (
        'KFold',
        'GroupKFold',
        'StratifiedKFold',
        'LeaveOneGroupOut',
        'LeavePGroupsOut',
        'LeaveOneOut',
        'LeavePOut',
        'ShuffleSplit',
        'GroupShuffleSplit',
        'PredefinedSplit',
        'TimeSeriesSplit',
    )

    # Splitter Functions

    # Hyper-parameter optimizers
    optimizers = (
        'GridSearchCV',
        'RandomizedSearchCV',
        'ParameterGrid',
        'ParameterSampler',
    )

    for attr in splitter_classes + optimizers:
        self.assertIs(getattr(df.model_selection, attr), getattr(ms, attr))

    # Model validation
Example #14
Source File: listing_9_5_crossvalidate.py From fight-churn with MIT License | 5 votes |
def crossvalidate(data_set_path, n_test_split):
    """Cross-validate an L1 logistic regression over a range of ``C`` values.

    The data is loaded with ``prepare_data(data_set_path, as_retention=False)``
    and each ``C`` candidate is scored ('lift' and 'AUC') on a
    ``TimeSeriesSplit`` of ``n_test_split`` folds, without refitting. The CV
    table gets an extra ``n_weights`` column from ``test_n_weights``, is
    written to ``*_crossval.csv`` and is passed to ``plot_regression_test``.

    Args:
        data_set_path: Path of the data set; also used to derive the output path.
        n_test_split: Number of splits handed to ``TimeSeriesSplit``.
    """
    features, target = prepare_data(data_set_path, as_retention=False)

    candidate_params = {
        'C': [0.64, 0.32, 0.16, 0.08, 0.04, 0.02, 0.01, 0.005, 0.0025],
    }
    search = GridSearchCV(
        estimator=LogisticRegression(penalty='l1', solver='liblinear',
                                     fit_intercept=True),
        param_grid=candidate_params,
        scoring={
            'lift': make_scorer(calc_lift, needs_proba=True),
            'AUC': 'roc_auc',
        },
        cv=TimeSeriesSplit(n_splits=n_test_split),
        refit=False,
        return_train_score=False,
        verbose=1,
    )
    search.fit(features, target)

    result_df = pd.DataFrame(search.cv_results_)
    result_df['n_weights'] = test_n_weights(features, target, candidate_params)

    output_path = data_set_path.replace('.csv', '_crossval.csv')
    result_df.to_csv(output_path, index=False)
    plot_regression_test(data_set_path, result_df)
Example #15
Source File: time_series_split.py From timeserio with MIT License | 5 votes |
def split(self, df, y=None, groups=None):
    """Yield train/test positions, time-series-splitting every group of ``df``.

    The frame is partitioned with ``df.groupby(self.groupby)``. Each group gets
    its own ``TimeSeriesSplit(n_splits, max_train_size)``; the k-th yielded
    pair is the concatenation of the k-th fold of every group that still has
    a fold left.

    Args:
        df: DataFrame to split; validated by ``self._validate_df``.
        y: Optional target, indexed with each group's positional indices and
            forwarded to the per-group splitter.
        groups: Not used by this method; the grouping comes from
            ``self.groupby``.

    Yields:
        Tuple ``(train_idx, test_idx)`` of numpy arrays holding row positions
        in ``df``.
    """
    self._validate_df(df)

    # Maps group key -> positional row indices of that group within ``df``.
    group_positions = df.groupby(self.groupby).indices
    fold_iterators = {}

    while True:
        train_parts, test_parts = [], []

        for key, sub_idx in group_positions.items():
            if key not in fold_iterators:
                sub_df = df.iloc[sub_idx]
                sub_y = y[sub_idx] if y is not None else None
                # Keyword arguments: ``max_train_size`` is keyword-only in
                # current scikit-learn, so the positional form fails there.
                splitter = TimeSeriesSplit(
                    n_splits=self.n_splits,
                    max_train_size=self.max_train_size,
                )
                fold_iterators[key] = splitter.split(sub_df, sub_y)

            try:
                train_rel, test_rel = next(fold_iterators[key])
            except StopIteration:
                # This group has no folds left; others may still have some.
                continue

            # Fold indices are relative to the group. ``sub_idx`` already
            # holds the matching positions in ``df``, so index it directly
            # rather than looking labels up with ``df.index.get_loc``, which
            # does not return a single position when a label is duplicated.
            positions = np.asarray(sub_idx)
            train_parts.append(positions[train_rel])
            test_parts.append(positions[test_rel])

        if not train_parts:
            break

        yield np.concatenate(train_parts), np.concatenate(test_parts)
Example #16
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_nsplit_default_warn():
    # Building a splitter without n_splits must emit the FutureWarning;
    # passing n_splits explicitly must not warn. Will be removed in 0.22
    splitter_classes = (KFold, GroupKFold, StratifiedKFold, TimeSeriesSplit)

    for splitter_cls in splitter_classes:
        assert_warns_message(FutureWarning, NSPLIT_WARNING, splitter_cls)

    for splitter_cls in splitter_classes:
        assert_no_warnings(splitter_cls, n_splits=5)
Example #17
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_time_series_max_train_size():
    """Check TimeSeriesSplit's ``max_train_size`` against the unbounded splits.

    Three situations are covered for a 6-sample input split three times:
    a cap of 3, a cap of 2 (smaller than a fold's training set) and a cap
    of 5 (not smaller than any fold's training set).
    """
    X = np.zeros((6, 1))

    # ``split`` hands back a one-shot iterator. Store the reference splits in
    # a list so that every check below compares against all three folds;
    # sharing the raw iterator would leave nothing for the second and third
    # checks to compare against.
    splits = list(TimeSeriesSplit(n_splits=3).split(X))

    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=3)

    # Test for the case where the size of a fold is greater than max_train_size
    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=2)

    # Test for the case where the size of each fold is less than max_train_size.
    # The bound passed to the helper must be the one the splitter was built
    # with (5); the training sets here have 3, 4 and 5 samples.
    # NOTE(review): assumes the helper asserts len(train) <= max_train_size - confirm.
    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=5)
Example #18
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_time_series_cv():
    """Verify fold contents, ordering and split count of TimeSeriesSplit on toy data."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    # Should fail if there are more folds than samples
    too_many_folds = TimeSeriesSplit(n_splits=7).split(X)
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, too_many_folds)

    tscv = TimeSeriesSplit(2)

    # Manually check that Time Series CV preserves the data
    # ordering on toy datasets
    expectations = (
        # Six samples: two folds with test blocks of two.
        (tscv.split(X[:-1]), (([0, 1], [2, 3]),
                              ([0, 1, 2, 3], [4, 5]))),
        # Seven samples: the extra sample goes to the first training set.
        (TimeSeriesSplit(2).split(X), (([0, 1, 2], [3, 4]),
                                       ([0, 1, 2, 3, 4], [5, 6]))),
    )
    for folds, wanted in expectations:
        for wanted_train, wanted_test in wanted:
            train, test = next(folds)
            assert_array_equal(train, wanted_train)
            assert_array_equal(test, wanted_test)

    # Check get_n_splits returns the correct number of splits
    n_splits_actual = len([fold for fold in TimeSeriesSplit(2).split(X)])
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)