Python sklearn.cross_validation.StratifiedShuffleSplit() Examples

The following are 16 code examples of sklearn.cross_validation.StratifiedShuffleSplit(), drawn from open-source projects; the source file, project, and license are listed above each example. You may also want to check out the other available functions and classes of the sklearn.cross_validation module. Note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20; the equivalent class now lives in sklearn.model_selection.
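
For orientation, the two call patterns differ as follows: the legacy class takes the labels in its constructor and the splitter object is iterated directly, while the sklearn.model_selection replacement takes only the split parameters and yields indices from .split(X, y). A minimal sketch with toy data (not taken from any of the projects below):

import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

# Legacy API (removed in scikit-learn 0.20): labels go to the constructor
# and the splitter itself is iterated.
from sklearn.cross_validation import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(y, n_iter=3, test_size=0.2, random_state=0)
for train_idx, test_idx in sss:
    print(train_idx, test_idx)  # integer index arrays

# Current API: only split parameters go to the constructor,
# and the labels are passed to .split().
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
for train_idx, test_idx in sss.split(X, y):
    print(train_idx, test_idx)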
Example #1
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License
def test_stratified_shuffle_split_init():
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)

    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)

    # Train size or test size too small
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2) 
Example #2
Source File: car_eval.py    From mHTM with MIT License
def base_learners(data_path='data.csv', seed=123456789):
	"""
	Test some classifiers on the raw data.
	"""
	
	# Params
	nsplits = 8
	pct_train = 0.8
	
	# Get data
	data = pd.read_csv(data_path)
	x = data.ix[:, :-1].as_matrix()
	y = data.ix[:, -1].as_matrix()
	x, y = convert_data_to_int(x, y)
	
	# Run random forest in parallel
	sss = StratifiedShuffleSplit(y, n_iter=nsplits, train_size=pct_train,
		random_state=seed)
	results = Parallel(n_jobs=-1)(delayed(train_score_clf)(
		RandomForestClassifier(random_state=i), x[tr], x[te], y[tr], y[te])
		for i, (tr, te) in enumerate(sss))
	print 'Random Forest: {0:.3f} %'.format(np.median(results))
	
	# Run SVM in parallel
	sss = StratifiedShuffleSplit(y, n_iter=nsplits, train_size=pct_train,
		random_state=seed)
	results = Parallel(n_jobs=-1)(delayed(train_score_clf)(
		LinearSVC(random_state=i), x[tr], x[te], y[tr], y[te])
		for i, (tr, te) in enumerate(sss))
	print 'Linear SVM: {0:.3f} %'.format(np.median(results)) 
Example #3
Source File: loader.py    From mHTM with MIT License
def _create_generator(self):
		"""
		Create a generator for the data. Yield a tuple containing the current
		training and testing split.
		"""
		
		# Create the CV iterators
		sss_tr = StratifiedShuffleSplit(self.tr_y, self.nsplits,
			train_size=self.train_size, random_state=self.seed)
		sss_te = StratifiedShuffleSplit(self.te_y, self.nsplits,
			train_size=self.test_size, random_state=self.seed)
		
		# Yield each item
		for tr, te in izip(sss_tr, sss_te):
			yield tr[0], te[0] + len(self.tr_y) # Offset testing indexes 
Example #4
Source File: utils.py    From kaggle_otto with BSD 3-Clause "New" or "Revised" License
def stratified_split(x, y, test_size=0.2):
    strat_shuffled_split = StratifiedShuffleSplit(y, n_iter=1, test_size=test_size, random_state=23)
    train_index, valid_index = [s for s in strat_shuffled_split][0]

    x_train, y_train, x_valid, y_valid = x[train_index, :], y[train_index], x[valid_index, :], y[valid_index]

    return x_train, y_train, x_valid, y_valid 
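
A minimal usage sketch for the helper above, assuming the function and its legacy StratifiedShuffleSplit import are in scope; the toy arrays are illustrative, not from the kaggle_otto project:

import numpy as np

# 100 samples, 5 features, perfectly balanced binary labels
x = np.random.RandomState(0).rand(100, 5)
y = np.array([0] * 50 + [1] * 50)

x_train, y_train, x_valid, y_valid = stratified_split(x, y, test_size=0.2)
# Stratification preserves the 50/50 class ratio in both subsets
assert y_train.mean() == y_valid.mean() == 0.5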
Example #5
Source File: data_dirs_organizer.py    From painters with MIT License
def _train_val_split_indices(labels):
    split = StratifiedShuffleSplit(
        labels, n_iter=1, test_size=VAL_SIZE, random_state=42)
    indices_tr, indices_val = next(iter(split))

    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=False)
    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=True)
    return indices_tr, indices_val, split.classes 
Example #6
Source File: data.py    From kaggle_diabetic with MIT License
def split_indices(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    names = get_names(files)
    labels = get_labels(names, per_patient=True)
    spl = cross_validation.StratifiedShuffleSplit(labels[:, 0], 
                                                  test_size=test_size, 
                                                  random_state=random_state,
                                                  n_iter=1)
    tr, te = next(iter(spl))
    tr = np.hstack([tr * 2, tr * 2 + 1])
    te = np.hstack([te * 2, te * 2 + 1])
    return tr, te 
Example #7
Source File: PipeTasks.py    From ProFET with GNU General Public License v3.0
def Get_yPred (X,y,clf_class,n_folds=10, pred_proba=False) : #,**kwargs):
    '''
    Return "Full" Y_predictions from a given c;assifier (not just from one split): (From def run_cv)
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html

    Could also be done with a stratified shuffle split (appending each split's output)?
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
    '''
    # Construct a kfolds object
    # kf = StratifiedKFold(len(y),n_folds,shuffle=True) #shuffle?
    kf = StratifiedKFold(y,n_folds,shuffle=True) #shuffle?
    if pred_proba:
        # predict_proba returns an (n_samples, n_classes) array, so allocate a
        # float matrix rather than reusing y's 1-D shape and dtype
        y_pred = np.zeros((len(y), len(np.unique(y))))
    else:
        y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # sample_weight=balance_weights(y_train)

        # Initialize a classifier with key word arguments
        clf = clf_class #(**kwargs)
        #sample_weight weighting not working here.. ?  TODO
        clf.fit(X_train,y_train) #,sample_weight) #
        if pred_proba == True:
            y_pred[test_index] = clf.predict_proba(X_test)
        else:
            y_pred[test_index] = clf.predict(X_test)
    return y_pred 
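
The docstring's suggestion of doing this with a stratified shuffle split could look roughly like the sketch below (the function name and defaults are mine, not ProFET's). Because shuffle splits may place a sample in several test sets, or in none, predictions are appended per split rather than written back into a single full-length array:

import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit

def Get_yPred_sss(X, y, clf_class, n_iter=10, test_size=0.1):
    '''Collect held-out predictions over repeated stratified shuffle splits.'''
    y_true_parts, y_pred_parts = [], []
    sss = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=0)
    for train_index, test_index in sss:
        clf = clf_class
        clf.fit(X[train_index], y[train_index])
        y_true_parts.append(y[test_index])
        y_pred_parts.append(clf.predict(X[test_index]))
    return np.concatenate(y_true_parts), np.concatenate(y_pred_parts)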
Example #8
Source File: PipeTasks.py    From ProFET with GNU General Public License v3.0
def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova)
    varying the percentile of features selected (F-test).

    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py

    See also (similar, but with model selection from among classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb

    '''
    transform = SelectPercentile(f_classif)

    clf = Pipeline([('anova', transform), ('est', est)])
    ###############################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 99)
    # percentiles = (1,5,10,25,50,75,90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3), n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())
    print("Outputting Graph:")

    plt.errorbar(percentiles, score_means, np.array(score_stds))

    plt.title(
        'Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show() 
Example #9
Source File: OutPutRes.py    From ProFET with GNU General Public License v3.0
def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova)
    varying the percentile of features selected (F-test).

    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py
    '''
    transform = SelectPercentile(f_classif)

    clf = Pipeline([('anova', transform), ('est', est)])
    ###############################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 100)
    # percentiles = (1,5,10,25,50,75,90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3), n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())
    print("Outputting Graph:")

    plt.errorbar(percentiles, score_means, np.array(score_stds))

    plt.title(
        'Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show() 
Example #10
Source File: OutPutRes.py    From ProFET with GNU General Public License v3.0
def CV_multi_stats(X, y, model,n=6) :
    '''
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    This version uses multiclass (or multilabel) compatible metrics.

    May be expanded to use the cross_val_score helper function:
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.cross_val_score.html
    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
    '''

    scores = cross_val_score(estimator=model, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=n, test_size=0.16), n_jobs=-1) #Accuracy
    scores_f1 = cross_val_score(estimator=model, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=n, test_size=0.16), n_jobs=-1, scoring='f1')
    print("Model Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("Model f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
    return (scores.mean(), scores.std() ,scores_f1.mean(), scores_f1.std() ) #Removed * 2 from returned STD .. ? 
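
A hedged port of the same summary to the current sklearn.model_selection API (the function name is mine); note that plain scoring='f1' is rejected for multiclass targets in recent scikit-learn releases, so an averaged variant such as 'f1_macro' must be requested explicitly:

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

def CV_multi_stats_modern(X, y, model, n=6):
    cv = StratifiedShuffleSplit(n_splits=n, test_size=0.16, random_state=0)
    scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)  # accuracy by default
    scores_f1 = cross_val_score(model, X, y, cv=cv, n_jobs=-1, scoring='f1_macro')
    return (scores.mean(), scores.std(), scores_f1.mean(), scores_f1.std())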
Example #11
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License
def test_stratified_shuffle_split_iter():
    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50)
          ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0)
        test_size = np.ceil(0.33 * len(y))
        train_size = len(y) - test_size
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))
            # Checks if folds keep classes proportions
            p_train = (np.bincount(np.unique(y[train],
                                   return_inverse=True)[1]) /
                       float(len(y[train])))
            p_test = (np.bincount(np.unique(y[test],
                                  return_inverse=True)[1]) /
                      float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(len(train) + len(test), y.size)
            assert_equal(len(train), train_size)
            assert_equal(len(test), test_size)
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) 
Example #12
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License
def test_stratified_shuffle_split_overlap_train_test_bug():
    # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
    # the original bug report
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5

    splits = cval.StratifiedShuffleSplit(labels, n_iter=1,
                                         test_size=0.5, random_state=0)
    train, test = next(iter(splits))

    assert_array_equal(np.intersect1d(train, test), []) 
Example #13
Source File: cross_validation.py    From smappPy with GNU General Public License v2.0
def grouped_stratified_train_test_split(y, x, group_by=None, test_size=0.33, group_labeler=None, return_indices=False, **kwargs):
    """
    Split arrays or matrices into random training and test subsets. Each subset preserves the proportions of the labels in `y` (stratification).
    Based on StratifiedShuffleSplit from sklearn.cross_validation.

    if `group_by` is an iterable of length `len(y)`, indices with the same `group_by[i]` will be kept together in either the training or the test set.

    if `group_labeler` is a callable, it will be used to assign a label to a group of labels. The default is `lambda labels: int(np.round(np.average(labels)))`
    

    --------
    Example:

     X = np.array([[1, 2], [3, 4], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1]])
     y = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1])
     id = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])

     x_train, x_test, y_train, y_test = grouped_stratified_train_test_split(y,X,id)

    """

    if not group_labeler:
        group_labeler = lambda labels: int(np.round(np.average(labels)))

    group_indices = dict()
    group_labels = dict()
    for i,(label, group) in enumerate(zip(y, group_by)):
        if not group in group_labels:
            group_labels[group] = list()
            group_indices[group] = list()
        group_indices[group].append(i)
        group_labels[group].append(label)
    groups, labels = zip(*{ group: group_labeler(labels) for group, labels in group_labels.items() }.items())

    sss = StratifiedShuffleSplit(labels, 1, test_size=test_size, **kwargs)

    group_train_indices, group_test_indices = list(sss)[0]
    test_groups = [groups[i] for i in group_test_indices]
    train_groups = [groups[j] for j in group_train_indices]

    test_indices = [idx for group in test_groups for idx in group_indices[group]]
    train_indices = [idx for group in train_groups for idx in group_indices[group]]
    if return_indices:
        return train_indices, test_indices
    else:
        return x[train_indices], x[test_indices], y[train_indices], y[test_indices] 
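
A runnable version of the docstring example, assuming the function above is in scope (variable names are mine); the final assertion checks the documented guarantee that members of one group never end up on both sides of the split:

import numpy as np

X = np.array([[1, 2], [3, 4], [1, 4], [3, 1], [1, 4], [3, 1],
              [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1]])
y = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1])
ids = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])

train_idx, test_idx = grouped_stratified_train_test_split(
    y, X, group_by=ids, test_size=0.33, return_indices=True, random_state=0)
# No group id appears in both the training and the test indices
assert not set(ids[train_idx]) & set(ids[test_idx])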
Example #14
Source File: helpers.py    From scipy_2015_sklearn_tutorial with Creative Commons Zero v1.0 Universal
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    f = open(os.path.join('datasets', 'titanic', 'titanic3.csv'))
    # Remove . from home.dest, split on quotes because some fields have commas
    keys = f.readline().strip().replace('.', '').split('","')
    lines = f.readlines()
    f.close()
    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]
    train_vectorizer_list = []
    test_vectorizer_list = []

    n_samples = len(lines)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)

    # Doing this twice is horribly inefficient but the file is small...
    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        strings = {k: line_dict[k] for k in string_keys}
        numeric_labels[n] = line_dict["survived"]

    sss = StratifiedShuffleSplit(numeric_labels, n_iter=1, test_size=test_size,
                                 random_state=12)
    # This is a weird way to get the indices but it works
    train_idx = None
    test_idx = None
    for train_idx, test_idx in sss:
        pass

    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        strings = {k: line_dict[k] for k in string_keys}
        if n in train_idx:
            train_vectorizer_list.append(strings)
        else:
            test_vectorizer_list.append(strings)
        numeric_data[n] = np.asarray([line_dict[k]
                                      for k in numeric_keys])

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels 
Example #15
Source File: OutPutRes.py    From ProFET with GNU General Public License v3.0
def CV_Binary_stats(X, y, model,n=10) :
    '''
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    Note that some of the metrics here ONLY work for BINARY tasks.
    This will be VERY slow compared to the built-in, multicore CV implementation. (Unless
     used with a classifier that is parallelized anyway, such as RF).
    By default, balances weights when fitting

    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
    '''
    from sklearn.metrics import precision_score, accuracy_score, recall_score,precision_recall_fscore_support

    mean_auc = 0.0
    mean_precision = 0.0
    mean_recall = 0.0
    mean_accuracy = 0.0

    sss = StratifiedShuffleSplit(y,  n_iter=n, test_size=0.2, random_state=0)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # for i in range(n) :
    #     # for each iteration, randomly hold out 30% of the data as CV set
    #     X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y,
    #                                                                      test_size=.15,
    #                                                                      random_state=i)
    #     cv=StratifiedShuffleSplit(y=y_train, n_iter=11, test_size=0.11)
        # train model and make predictions
        model.fit(X_train, y_train,sample_weight=balance_weights(y_train))
        # preds = model.predict(X_cv)
        preds = model.predict(X_test)

        '''
        # ROC_AUC - Restricted to binary (not multiclass) case.
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        # print("( %d/%d)" % (i + 1, n))
        mean_auc += roc_auc
        '''
        accuracy = accuracy_score(y_test, preds)
        precision = precision_score(y_test, preds)
        recall = recall_score(y_test, preds)
        mean_accuracy += accuracy
        mean_precision += precision
        mean_recall += recall

    mean_accuracy = (mean_accuracy / n)
    mean_precision = mean_precision / n
    mean_recall = mean_recall / n
    # mean_auc = mean_auc / n
    print('mean_accuracy:  %s ' %(round(mean_accuracy, 3)))
    print('mean_precision:  %s ' %(round(mean_precision, 3)))
    print('mean_recall:  %s ' %(round(mean_recall, 3)))
    # print('mean_auc:  %s ' %(round(mean_auc, 3)))
    return (mean_accuracy,mean_precision,mean_recall) 
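
For comparison, a hedged sketch of the same three-metric summary using the current sklearn.model_selection API with cross_validate and multiple scorers (the function name is mine, not ProFET's); precision and recall here keep the original binary-only restriction:

from sklearn.model_selection import StratifiedShuffleSplit, cross_validate

def CV_Binary_stats_modern(X, y, model, n=10):
    cv = StratifiedShuffleSplit(n_splits=n, test_size=0.2, random_state=0)
    res = cross_validate(model, X, y, cv=cv, n_jobs=-1,
                         scoring=('accuracy', 'precision', 'recall'))
    return (res['test_accuracy'].mean(),
            res['test_precision'].mean(),
            res['test_recall'].mean())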
Example #16
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License
def test_stratified_shuffle_split_even():
    # Test that StratifiedShuffleSplit draws the indices with
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            p = bf.pmf(count)
            assert_true(p > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
                                             test_size=1. / n_folds,
                                             random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits:
            n_splits += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits, n_iter)

        assert_equal(len(train), splits.n_train)
        assert_equal(len(test), splits.n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(splits.n_train + splits.n_test, len(labels))
        assert_equal(len(label_counts), 2)
        ex_test_p = float(splits.n_test) / n_samples
        ex_train_p = float(splits.n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)