Python sklearn.cross_validation.StratifiedShuffleSplit() Examples

The following are 16 code examples of sklearn.cross_validation.StratifiedShuffleSplit(), drawn from open-source projects; the source file, project, and license are listed above each example. You may also want to check out the other available functions and classes of the sklearn.cross_validation module. Note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20; the equivalent class now lives in sklearn.model_selection.
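
For orientation, the two call patterns differ as follows: the legacy class takes the labels in its constructor and the splitter object is iterated directly, while the sklearn.model_selection replacement takes only the split parameters and yields indices from .split(X, y). A minimal sketch with toy data (not taken from any of the projects below):

import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

# Legacy API (removed in scikit-learn 0.20): labels go to the constructor
# and the splitter itself is iterated.
from sklearn.cross_validation import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(y, n_iter=3, test_size=0.2, random_state=0)
for train_idx, test_idx in sss:
    print(train_idx, test_idx)  # integer index arrays

# Current API: only split parameters go to the constructor,
# and the labels are passed to .split().
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
for train_idx, test_idx in sss.split(X, y):
    print(train_idx, test_idx)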
Example #1
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License
def test_stratified_shuffle_split_init():
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)

    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)

    # Train size or test size too small
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2) 
Example #2
Source File: car_eval.py    From mHTM with MIT License
def base_learners(data_path='data.csv', seed=123456789):
	"""
	Test some classifiers on the raw data.
	"""
	
	# Params
	nsplits = 8
	pct_train = 0.8
	
	# Get data
	data = pd.read_csv(data_path)
	x = data.ix[:, :-1].as_matrix()
	y = data.ix[:, -1].as_matrix()
	x, y = convert_data_to_int(x, y)
	
	# Run random forest in parallel
	sss = StratifiedShuffleSplit(y, n_iter=nsplits, train_size=pct_train,
		random_state=seed)
	results = Parallel(n_jobs=-1)(delayed(train_score_clf)(
		RandomForestClassifier(random_state=i), x[tr], x[te], y[tr], y[te])
		for i, (tr, te) in enumerate(sss))
	print 'Random Forest: {0:.3f} %'.format(np.median(results))
	
	# Run SVM in parallel
	sss = StratifiedShuffleSplit(y, n_iter=nsplits, train_size=pct_train,
		random_state=seed)
	results = Parallel(n_jobs=-1)(delayed(train_score_clf)(
		LinearSVC(random_state=i), x[tr], x[te], y[tr], y[te])
		for i, (tr, te) in enumerate(sss))
	print 'Linear SVM: {0:.3f} %'.format(np.median(results)) 
Example #3
Source File: loader.py    From mHTM with MIT License
def _create_generator(self):
		"""
		Create a generator for the data. Yield a tuple containing the current
		training and testing split.
		"""
		
		# Create the CV iterators
		sss_tr = StratifiedShuffleSplit(self.tr_y, self.nsplits,
			train_size=self.train_size, random_state=self.seed)
		sss_te = StratifiedShuffleSplit(self.te_y, self.nsplits,
			train_size=self.test_size, random_state=self.seed)
		
		# Yield each item
		for tr, te in izip(sss_tr, sss_te):
			yield tr[0], te[0] + len(self.tr_y) # Offset testing indexes 
Example #4
Source File: utils.py    From kaggle_otto with BSD 3-Clause "New" or "Revised" License
def stratified_split(x, y, test_size=0.2):
    strat_shuffled_split = StratifiedShuffleSplit(y, n_iter=1, test_size=test_size, random_state=23)
    train_index, valid_index = [s for s in strat_shuffled_split][0]

    x_train, y_train, x_valid, y_valid = x[train_index, :], y[train_index], x[valid_index, :], y[valid_index]

    return x_train, y_train, x_valid, y_valid 
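
A minimal usage sketch for the helper above, assuming the function and its legacy StratifiedShuffleSplit import are in scope; the toy arrays are illustrative, not from the kaggle_otto project:

import numpy as np

# 100 samples, 5 features, perfectly balanced binary labels
x = np.random.RandomState(0).rand(100, 5)
y = np.array([0] * 50 + [1] * 50)

x_train, y_train, x_valid, y_valid = stratified_split(x, y, test_size=0.2)
# Stratification preserves the 50/50 class ratio in both subsets
assert y_train.mean() == y_valid.mean() == 0.5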
Example #5
Source File: data_dirs_organizer.py    From painters with MIT License
def _train_val_split_indices(labels):
    split = StratifiedShuffleSplit(
        labels, n_iter=1, test_size=VAL_SIZE, random_state=42)
    indices_tr, indices_val = next(iter(split))

    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=False)
    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=True)
    return indices_tr, indices_val, split.classes 
Example #6
Source File: data.py    From kaggle_diabetic with MIT License
def split_indices(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    names = get_names(files)
    labels = get_labels(names, per_patient=True)
    spl = cross_validation.StratifiedShuffleSplit(labels[:, 0], 
                                                  test_size=test_size, 
                                                  random_state=random_state,
                                                  n_iter=1)
    tr, te = next(iter(spl))
    tr = np.hstack([tr * 2, tr * 2 + 1])
    te = np.hstack([te * 2, te * 2 + 1])
    return tr, te 
Example #7
Source File: PipeTasks.py    From ProFET with GNU General Public License v3.0
def Get_yPred (X,y,clf_class,n_folds=10, pred_proba=False) : #,**kwargs):
    '''
    Return "Full" Y_predictions from a given c;assifier (not just from one split): (From def run_cv)
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html

    Could also be done with a stratified shuffle split (appending each split's output)?
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
    '''
    # Construct a kfolds object
    # kf = StratifiedKFold(len(y),n_folds,shuffle=True) #shuffle?
    kf = StratifiedKFold(y,n_folds,shuffle=True) #shuffle?
    if pred_proba:
        # predict_proba returns an (n_samples, n_classes) array, so allocate a
        # float matrix rather than reusing y's 1-D shape and dtype
        y_pred = np.zeros((len(y), len(np.unique(y))))
    else:
        y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # sample_weight=balance_weights(y_train)

        # Initialize a classifier with key word arguments
        clf = clf_class #(**kwargs)
        #sample_weight weighting not working here.. ?  TODO
        clf.fit(X_train,y_train) #,sample_weight) #
        if pred_proba == True:
            y_pred[test_index] = clf.predict_proba(X_test)
        else:
            y_pred[test_index] = clf.predict(X_test)
    return y_pred 
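
The docstring's suggestion of doing this with a stratified shuffle split could look roughly like the sketch below (the function name and defaults are mine, not ProFET's). Because shuffle splits may place a sample in several test sets, or in none, predictions are appended per split rather than written back into a single full-length array:

import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit

def Get_yPred_sss(X, y, clf_class, n_iter=10, test_size=0.1):
    '''Collect held-out predictions over repeated stratified shuffle splits.'''
    y_true_parts, y_pred_parts = [], []
    sss = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=0)
    for train_index, test_index in sss:
        clf = clf_class
        clf.fit(X[train_index], y[train_index])
        y_true_parts.append(y[test_index])
        y_pred_parts.append(clf.predict(X[test_index]))
    return np.concatenate(y_true_parts), np.concatenate(y_pred_parts)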
Example #8
Source File: PipeTasks.py    From ProFET with GNU General Public License v3.0
def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova)
    varying the percentile of features selected (F-test).

    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py

    See also (similar, but with model selection from among classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb

    '''
    transform = SelectPercentile(f_classif)

    clf = Pipeline([('anova', transform), ('est', est)])
    ###############################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 99)
    # percentiles = (1,5,10,25,50,75,90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3), n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())
    print("Outputting Graph:")

    plt.errorbar(percentiles, score_means, np.array(score_stds))

    plt.title(
        'Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show() 
Example #9
Source File: OutPutRes.py    From ProFET with GNU General Public License v3.0
def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova)
    varying the percentile of features selected (F-test).

    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py
    '''
    transform = SelectPercentile(f_classif)

    clf = Pipeline([('anova', transform), ('est', est)])
    ###############################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 100)
    # percentiles = (1,5,10,25,50,75,90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3), n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())
    print("Outputting Graph:")

    plt.errorbar(percentiles, score_means, np.array(score_stds))

    plt.title(
        'Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show() 
Example #10
Source File: OutPutRes.py    From ProFET with GNU General Public License v3.0
def CV_multi_stats(X, y, model,n=6) :
    '''
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    This version uses multiclass (or multilabel) compatible metrics.

    May be expanded to use the cross_val_score helper function:
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.cross_val_score.html
    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
    '''

    scores = cross_val_score(estimator=model, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=n, test_size=0.16), n_jobs=-1) #Accuracy
    scores_f1 = cross_val_score(estimator=model, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=n, test_size=0.16), n_jobs=-1, scoring='f1')
    print("Model Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("Model f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
    return (scores.mean(), scores.std() ,scores_f1.mean(), scores_f1.std() ) #Removed * 2 from returned STD .. ? 
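
A hedged port of the same summary to the current sklearn.model_selection API (the function name is mine); note that plain scoring='f1' is rejected for multiclass targets in recent scikit-learn releases, so an averaged variant such as 'f1_macro' must be requested explicitly:

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

def CV_multi_stats_modern(X, y, model, n=6):
    cv = StratifiedShuffleSplit(n_splits=n, test_size=0.16, random_state=0)
    scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)  # accuracy by default
    scores_f1 = cross_val_score(model, X, y, cv=cv, n_jobs=-1, scoring='f1_macro')
    return (scores.mean(), scores.std(), scores_f1.mean(), scores_f1.std())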
Example #11
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License
def test_stratified_shuffle_split_iter():
    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50)
          ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0)
        test_size = np.ceil(0.33 * len(y))
        train_size = len(y) - test_size
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))
            # Checks if folds keep classes proportions
            p_train = (np.bincount(np.unique(y[train],
                                   return_inverse=True)[1]) /
                       float(len(y[train])))
            p_test = (np.bincount(np.unique(y[test],
                                  return_inverse=True)[1]) /
                      float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(len(train) + len(test), y.size)
            assert_equal(len(train), train_size)
            assert_equal(len(test), test_size)
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) 
Example #12
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License
def test_stratified_shuffle_split_overlap_train_test_bug():
    # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
    # the original bug report
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5

    splits = cval.StratifiedShuffleSplit(labels, n_iter=1,
                                         test_size=0.5, random_state=0)
    train, test = next(iter(splits))

    assert_array_equal(np.intersect1d(train, test), []) 
Example #13
Source File: cross_validation.py    From smappPy with GNU General Public License v2.0
def grouped_stratified_train_test_split(y, x, group_by=None, test_size=0.33, group_labeler=None, return_indices=False, **kwargs):
    """
    Split arrays or matrices into random training and test subsets. Each subset preserves the proportions of the labels in `y` (stratification).
    Based on StratifiedShuffleSplit from sklearn.cross_validation.

    if `group_by` is an iterable of length `len(y)`, indices with the same `group_by[i]` will be kept together in either the training or the test set.

    if `group_labeler` is a callable, it will be used to assign a label to a group of labels. The default is `lambda labels: int(np.round(np.average(labels)))`
    

    --------
    Example:

     X = np.array([[1, 2], [3, 4], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1]])
     y = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1])
     id = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])

     x_train, x_test, y_train, y_test = grouped_stratified_train_test_split(y,X,id)

    """

    if not group_labeler:
        group_labeler = lambda labels: int(np.round(np.average(labels)))

    group_indices = dict()
    group_labels = dict()
    for i,(label, group) in enumerate(zip(y, group_by)):
        if not group in group_labels:
            group_labels[group] = list()
            group_indices[group] = list()
        group_indices[group].append(i)
        group_labels[group].append(label)
    groups, labels = zip(*{ group: group_labeler(labels) for group, labels in group_labels.items() }.items())

    sss = StratifiedShuffleSplit(labels, 1, test_size=test_size, **kwargs)

    group_train_indices, group_test_indices = list(sss)[0]
    test_groups = [groups[i] for i in group_test_indices]
    train_groups = [groups[j] for j in group_train_indices]

    test_indices = [idx for group in test_groups for idx in group_indices[group]]
    train_indices = [idx for group in train_groups for idx in group_indices[group]]
    if return_indices:
        return train_indices, test_indices
    else:
        return x[train_indices], x[test_indices], y[train_indices], y[test_indices] 
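
A runnable version of the docstring example, assuming the function above is in scope (variable names are mine); the final assertion checks the documented guarantee that members of one group never end up on both sides of the split:

import numpy as np

X = np.array([[1, 2], [3, 4], [1, 4], [3, 1], [1, 4], [3, 1],
              [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1]])
y = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1])
ids = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])

train_idx, test_idx = grouped_stratified_train_test_split(
    y, X, group_by=ids, test_size=0.33, return_indices=True, random_state=0)
# No group id appears in both the training and the test indices
assert not set(ids[train_idx]) & set(ids[test_idx])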
Example #14
Source File: helpers.py    From scipy_2015_sklearn_tutorial with Creative Commons Zero v1.0 Universal
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    f = open(os.path.join('datasets', 'titanic', 'titanic3.csv'))
    # Remove . from home.dest, split on quotes because some fields have commas
    keys = f.readline().strip().replace('.', '').split('","')
    lines = f.readlines()
    f.close()
    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]
    train_vectorizer_list = []
    test_vectorizer_list = []

    n_samples = len(lines)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)

    # Doing this twice is horribly inefficient but the file is small...
    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        strings = {k: line_dict[k] for k in string_keys}
        numeric_labels[n] = line_dict["survived"]

    sss = StratifiedShuffleSplit(numeric_labels, n_iter=1, test_size=test_size,
                                 random_state=12)
    # This is a weird way to get the indices but it works
    train_idx = None
    test_idx = None
    for train_idx, test_idx in sss:
        pass

    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        strings = {k: line_dict[k] for k in string_keys}
        if n in train_idx:
            train_vectorizer_list.append(strings)
        else:
            test_vectorizer_list.append(strings)
        numeric_data[n] = np.asarray([line_dict[k]
                                      for k in numeric_keys])

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels 
Example #15
Source File: OutPutRes.py    From ProFET with GNU General Public License v3.0
def CV_Binary_stats(X, y, model,n=10) :
    '''
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    Note that some of the metrics here ONLY work for BINARY tasks.
    This will be VERY slow compared to the built-in, multicore CV implementation. (Unless
     used with a classifier that is parallelized anyway, such as RF).
    By default, balances weights when fitting

    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
    '''
    from sklearn.metrics import precision_score, accuracy_score, recall_score,precision_recall_fscore_support

    mean_auc = 0.0
    mean_precision = 0.0
    mean_recall = 0.0
    mean_accuracy = 0.0

    sss = StratifiedShuffleSplit(y,  n_iter=n, test_size=0.2, random_state=0)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # for i in range(n) :
    #     # for each iteration, randomly hold out 30% of the data as CV set
    #     X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y,
    #                                                                      test_size=.15,
    #                                                                      random_state=i)
    #     cv=StratifiedShuffleSplit(y=y_train, n_iter=11, test_size=0.11)
        # train model and make predictions
        model.fit(X_train, y_train,sample_weight=balance_weights(y_train))
        # preds = model.predict(X_cv)
        preds = model.predict(X_test)

        '''
        # ROC_AUC - Restricted to binary (not multiclass) case.
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        # print("( %d/%d)" % (i + 1, n))
        mean_auc += roc_auc
        '''
        accuracy = accuracy_score(y_test, preds)
        precision = precision_score(y_test, preds)
        recall = recall_score(y_test, preds)
        mean_accuracy += accuracy
        mean_precision += precision
        mean_recall += recall

    mean_accuracy = (mean_accuracy / n)
    mean_precision = mean_precision / n
    mean_recall = mean_recall / n
    # mean_auc = mean_auc / n
    print('mean_accuracy:  %s ' %(round(mean_accuracy, 3)))
    print('mean_precision:  %s ' %(round(mean_precision, 3)))
    print('mean_recall:  %s ' %(round(mean_recall, 3)))
    # print('mean_auc:  %s ' %(round(mean_auc, 3)))
    return (mean_accuracy,mean_precision,mean_recall) 
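
For comparison, a hedged sketch of the same three-metric summary using the current sklearn.model_selection API with cross_validate and multiple scorers (the function name is mine, not ProFET's); precision and recall here keep the original binary-only restriction:

from sklearn.model_selection import StratifiedShuffleSplit, cross_validate

def CV_Binary_stats_modern(X, y, model, n=10):
    cv = StratifiedShuffleSplit(n_splits=n, test_size=0.2, random_state=0)
    res = cross_validate(model, X, y, cv=cv, n_jobs=-1,
                         scoring=('accuracy', 'precision', 'recall'))
    return (res['test_accuracy'].mean(),
            res['test_precision'].mean(),
            res['test_recall'].mean())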
Example #16
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License
def test_stratified_shuffle_split_even():
    # Test that StratifiedShuffleSplit draws the indices with
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            p = bf.pmf(count)
            assert_true(p > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
                                             test_size=1. / n_folds,
                                             random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits:
            n_splits += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits, n_iter)

        assert_equal(len(train), splits.n_train)
        assert_equal(len(test), splits.n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(splits.n_train + splits.n_test, len(labels))
        assert_equal(len(label_counts), 2)
        ex_test_p = float(splits.n_test) / n_samples
        ex_train_p = float(splits.n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)