Python sklearn.utils.resample() Examples
The following are 24 code examples of sklearn.utils.resample(), drawn from open-source projects.
You can go to the original project or source file by following the link above each example.
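Before the project code, here is a minimal, self-contained sketch of the resample() API itself; the data is illustrative and not taken from any project below.

import numpy as np
from sklearn.utils import resample

X = np.arange(20).reshape(10, 2)
y = np.array([0] * 5 + [1] * 5)

# Bootstrap sample: with replacement, paired arrays stay aligned row-wise.
X_boot, y_boot = resample(X, y, replace=True, n_samples=10, random_state=0)

# Stratified subsample without replacement, preserving the class ratio of y.
X_sub, y_sub = resample(X, y, replace=False, n_samples=6, random_state=0,
                        stratify=y)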
Example #1
Source File: stats_utils.py From CDSS with GNU General Public License v3.0
def bootstrap_CI(actual_list, predict_list, num_repeats=1000, stat='roc_auc',
                 confident_lvl=0.95, side='two', random_state=0):
    assert len(actual_list) == len(predict_list)
    from sklearn.utils import resample
    try:
        all_stats = []
        for i in range(num_repeats):
            actual_list_resampled, predict_list_resampled = resample(actual_list, predict_list)
            if stat == 'roc_auc':
                cur_roc_auc = roc_auc_score(actual_list_resampled, predict_list_resampled)
                all_stats.append(cur_roc_auc)
        roc_auc_left = np.percentile(all_stats, (1 - confident_lvl) / 2. * 100)
        roc_auc_right = np.percentile(all_stats, (1 + confident_lvl) / 2. * 100)
    except Exception as e:
        # print e
        roc_auc_left, roc_auc_right = float('nan'), float('nan')
    return roc_auc_left, roc_auc_right
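A possible call to the function above, assuming numpy and sklearn.metrics.roc_auc_score are in scope as they are in stats_utils.py; the labels and scores here are made up for illustration.

import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.random.randint(0, 2, size=200)   # hypothetical labels
y_score = np.random.rand(200)                # hypothetical scores

left, right = bootstrap_CI(y_true, y_score, num_repeats=1000,
                           confident_lvl=0.95)
print('95% bootstrap CI for ROC AUC: [{:.3f}, {:.3f}]'.format(left, right))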
Example #2
Source File: demos.py From bayesian_bootstrap with MIT License
def plot_mean_bootstrap_exponential_readme():
    X = np.random.exponential(7, 4)
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    posterior_samples = mean(X, 10000)
    l, r = highest_density_interval(posterior_samples)
    classical_l, classical_r = highest_density_interval(classical_samples)
    plt.subplot(2, 1, 1)
    plt.title('Bayesian Bootstrap of mean')
    sns.distplot(posterior_samples, label='Bayesian Bootstrap Samples')
    plt.plot([l, r], [0, 0], linewidth=5.0, marker='o', label='95% HDI')
    plt.xlim(-1, 18)
    plt.legend()
    plt.subplot(2, 1, 2)
    plt.title('Classical Bootstrap of mean')
    sns.distplot(classical_samples, label='Classical Bootstrap Samples')
    plt.plot([classical_l, classical_r], [0, 0], linewidth=5.0, marker='o',
             label='95% HDI')
    plt.xlim(-1, 18)
    plt.legend()
    plt.savefig('readme_exponential.png', bbox_inches='tight')
Example #3
Source File: demos.py From bayesian_bootstrap with MIT License
def plot_mean_bootstrap():
    X = [-1, 0, 1]
    posterior_samples = mean(X, 10000)
    sns.distplot(posterior_samples)
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show()
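The classical half of these demos is a single idiom: resample the data with replacement, recompute the statistic, and repeat. A minimal self-contained version, using percentile bounds in place of the highest_density_interval helper used above:

import numpy as np
from sklearn.utils import resample

X = np.random.exponential(7, 40)
# 10,000 bootstrap replicates of the sample mean.
boot_means = [np.mean(resample(X)) for _ in range(10000)]
print('bootstrap 95% CI for the mean:',
      np.percentile(boot_means, [2.5, 97.5]))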
Example #4
Source File: test_utils.py From Mastering-Elasticsearch-7.0 with MIT License
def test_resample_stratified_replace():
    # Make sure stratified resampling supports the replace parameter
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=n_samples)

    X_replace, _ = resample(X, y, replace=True, n_samples=50,
                            random_state=rng, stratify=y)
    X_no_replace, _ = resample(X, y, replace=False, n_samples=50,
                               random_state=rng, stratify=y)
    assert np.unique(X_replace).shape[0] < 50
    assert np.unique(X_no_replace).shape[0] == 50

    # make sure n_samples can be greater than X.shape[0] if we sample with
    # replacement
    X_replace, _ = resample(X, y, replace=True, n_samples=1000,
                            random_state=rng, stratify=y)
    assert X_replace.shape[0] == 1000
    assert np.unique(X_replace).shape[0] == 100
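The behaviour this test pins down (replace=True with n_samples larger than the input) is also what makes resample() handy for naive minority-class oversampling; a sketch, not part of the test suite:

import numpy as np
from sklearn.utils import resample

X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 0, 0, 0, 1])  # a single minority sample

# Upsample the minority class (with replacement) to the majority count.
X_up, y_up = resample(X[y == 1], y[y == 1], replace=True, n_samples=5,
                      random_state=0)
X_bal = np.vstack([X[y == 0], X_up])
y_bal = np.concatenate([y[y == 0], y_up])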
Example #5
Source File: test_utils.py From Mastering-Elasticsearch-7.0 with MIT License
def test_resample_stratify_sparse_error():
    # resample must be ndarray
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 2))
    y = rng.randint(0, 2, size=n_samples)
    stratify = sp.csr_matrix(y)
    with pytest.raises(TypeError, match='A sparse matrix was passed'):
        X, y = resample(X, y, n_samples=50, random_state=rng,
                        stratify=stratify)
Example #6
Source File: test_utils.py From twitter-stock-recommendation with MIT License
def test_resample():
    # Border case not worth mentioning in doctests
    assert_true(resample() is None)

    # Check that invalid arguments yield ValueError
    assert_raises(ValueError, resample, [0], [0, 1])
    assert_raises(ValueError, resample, [0, 1], [0, 1],
                  replace=False, n_samples=3)
    assert_raises(ValueError, resample, [0, 1], [0, 1], meaning_of_life=42)
    # Issue:6581, n_samples can be more when replace is True (default).
    assert_equal(len(resample([1, 2], n_samples=5)), 5)
Example #7
Source File: sample.py From Benchmarks with MIT License
def sample(dataset, num_samples, replace=True):
    """ Sample the dataset """
    data_idx = dummy_indices(dataset)
    sample_idx = resample(data_idx, n_samples=num_samples, replace=replace)
    return Subset(dataset, sample_idx)
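One way to try this, assuming dummy_indices simply enumerates the dataset's indices and Subset is torch.utils.data.Subset (both are assumptions about this repo's helpers, not confirmed by the snippet above):

import torch
from torch.utils.data import TensorDataset

dataset = TensorDataset(torch.arange(100))
# Same-size bootstrap of the dataset (sampling indices with replacement).
bootstrap = sample(dataset, num_samples=100)
assert len(bootstrap) == 100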
Example #8
Source File: rotation_forest.py From RotationForest with MIT License
def _fit_rotation_matrix(self, X):
    random_state = check_random_state(self.random_state)
    n_samples, n_features = X.shape
    self.rotation_matrix = np.zeros((n_features, n_features),
                                    dtype=np.float32)
    for i, subset in enumerate(
            random_feature_subsets(X, self.n_features_per_subset,
                                   random_state=self.random_state)):
        # take a 75% bootstrap from the rows
        x_sample = resample(X, n_samples=int(n_samples * 0.75),
                            random_state=10 * i)
        pca = self.pca_algorithm()
        pca.fit(x_sample[:, subset])
        self.rotation_matrix[np.ix_(subset, subset)] = pca.components_
Example #9
Source File: misc.py From lumin with Apache License 2.0
def subsample_df(df: pd.DataFrame, objective: str, targ_name: str,
                 n_samples: Optional[int] = None, replace: bool = False,
                 strat_key: Optional[str] = None,
                 wgt_name: Optional[str] = None) -> pd.DataFrame:
    r'''
    Subsamples, or samples with replacement, a DataFrame.
    Will automatically reweight data such that weight sums remain the same as the original DataFrame (per class).

    Arguments:
        df: DataFrame to sample
        objective: string representation of objective: either 'classification' or 'regression'
        targ_name: name of column containing target data
        n_samples: if set, will sample that number of data points, otherwise will sample with replacement a new DataFrame of the same size as the original
        replace: whether to sample with replacement
        strat_key: column name to use for stratified subsampling, if desired
        wgt_name: name of column containing weight data. If set, will reweight subsampled data, otherwise will not
    '''
    tmp_df = df.loc[resample(df.index, replace=replace, n_samples=n_samples,
                             stratify=None if strat_key is None else df[strat_key])]

    # Reweight resampled data
    if wgt_name is not None:
        if 'class' in objective.lower():
            for c in tmp_df[targ_name].unique():
                tmp_df.loc[tmp_df[targ_name] == c, wgt_name] *= \
                    df.loc[df[targ_name] == c, wgt_name].sum() / tmp_df.loc[tmp_df[targ_name] == c, wgt_name].sum()
        else:
            tmp_df[wgt_name] *= df[wgt_name].sum() / tmp_df[wgt_name].sum()
    return tmp_df
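A hedged usage sketch with a toy DataFrame (the column names are made up); after the reweighting step, the per-class weight sums of the subsample should match those of the original:

import numpy as np
import pandas as pd

df = pd.DataFrame({'target': np.random.randint(0, 2, 100),
                   'weight': np.random.rand(100),
                   'feature': np.random.randn(100)})

# Stratified 50-row subsample, reweighted per class.
sub = subsample_df(df, objective='classification', targ_name='target',
                   n_samples=50, strat_key='target', wgt_name='weight')
assert np.isclose(sub.loc[sub.target == 1, 'weight'].sum(),
                  df.loc[df.target == 1, 'weight'].sum())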
Example #10
Source File: test_utils.py From Mastering-Elasticsearch-7.0 with MIT License
def test_resample_stratify_2dy():
    # Make sure y can be 2d when stratifying
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=(n_samples, 2))
    X, y = resample(X, y, n_samples=50, random_state=rng, stratify=y)
    assert y.ndim == 2
Example #11
Source File: test_utils.py From Mastering-Elasticsearch-7.0 with MIT License
def test_resample_stratified():
    # Make sure resample can stratify
    rng = np.random.RandomState(0)
    n_samples = 100
    p = .9
    X = rng.normal(size=(n_samples, 1))
    y = rng.binomial(1, p, size=n_samples)

    _, y_not_stratified = resample(X, y, n_samples=10, random_state=0,
                                   stratify=None)
    assert np.all(y_not_stratified == 1)

    _, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
    assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9  # all 1s, one 0
Example #12
Source File: test_utils.py From Mastering-Elasticsearch-7.0 with MIT License
def test_resample():
    # Border case not worth mentioning in doctests
    assert resample() is None

    # Check that invalid arguments yield ValueError
    assert_raises(ValueError, resample, [0], [0, 1])
    assert_raises(ValueError, resample, [0, 1], [0, 1],
                  replace=False, n_samples=3)
    assert_raises(ValueError, resample, [0, 1], [0, 1], meaning_of_life=42)
    # Issue:6581, n_samples can be more when replace is True (default).
    assert_equal(len(resample([1, 2], n_samples=5)), 5)
Example #13
Source File: core.py From ffn with MIT License
def resample_returns(returns, func, seed=0, num_trials=100):
    """
    Resample the returns and calculate any statistic on every new sample.

    https://en.wikipedia.org/wiki/Resampling_(statistics)

    :param returns (Series, DataFrame): Returns
    :param func: Given the resampled returns calculate a statistic
    :param seed: Seed for random number generator
    :param num_trials: Number of times to resample and run the experiment
    :return: Series of resampled statistics
    """
    if type(returns) is pd.Series:
        stats = pd.Series(index=range(num_trials))
    elif type(returns) is pd.DataFrame:
        stats = pd.DataFrame(index=range(num_trials),
                             columns=returns.columns)
    else:
        raise TypeError("returns needs to be a Series or DataFrame!")

    n = returns.shape[0]
    for i in range(num_trials):
        random_indices = resample(returns.index, n_samples=n,
                                  random_state=seed + i)
        stats.loc[i] = func(returns.loc[random_indices])

    return stats
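A possible call with synthetic daily returns (the numbers are illustrative); each trial resamples the index with a different seed derived from the base seed, so results are reproducible:

import numpy as np
import pandas as pd

returns = pd.Series(np.random.normal(0.001, 0.02, 252))
# Bootstrap distribution of the mean daily return over 100 trials.
boot_means = resample_returns(returns, np.mean, seed=0, num_trials=100)
print(boot_means.quantile([0.025, 0.975]))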
Example #14
Source File: demos.py From bayesian_bootstrap with MIT License
def plot_regression_bootstrap():
    X = np.array([[0], [1], [2], [3]])
    y = np.array([0, 1, 2, 3]) + np.random.normal(0, 1, 4)
    classical_samples = [LinearRegression().fit(*resample(X, y)).coef_
                         for _ in tqdm(range(10000))]
    posterior_samples = bayesian_bootstrap_regression(
        X, y, lambda X, y: LinearRegression().fit(X, y).coef_, 10000, 1000)
    plt.scatter(X.reshape(-1, 1), y)
    plt.show()
    sns.distplot(classical_samples)
    sns.distplot(posterior_samples)
    plt.show()
Example #15
Source File: demos.py From bayesian_bootstrap with MIT License
def plot_var_resample_bootstrap():
    X = np.random.uniform(-1, 1, 100)
    posterior_samples = bayesian_bootstrap(X, np.var, 10000, 500)
    sns.distplot(posterior_samples)
    classical_samples = [np.var(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show()
Example #16
Source File: demos.py From bayesian_bootstrap with MIT License
def plot_var_bootstrap():
    X = np.random.uniform(-1, 1, 100)
    posterior_samples = var(X, 10000)
    sns.distplot(posterior_samples)
    classical_samples = [np.var(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show()
Example #17
Source File: demos.py From bayesian_bootstrap with MIT License
def plot_median():
    X = np.random.uniform(-1, 1, 10)
    posterior_samples = bayesian_bootstrap(X, np.median, 10000, 100)
    sns.distplot(posterior_samples)
    classical_samples = [np.median(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show()
Example #18
Source File: demos.py From bayesian_bootstrap with MIT License
def plot_mean_resample_bootstrap():
    X = [-1, 0, 1]
    posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100)
    sns.distplot(posterior_samples)
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show()
Example #19
Source File: evaluate_los.py From mimic3-benchmarks with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../data/length-of-stay/test/listfile.csv'))
    parser.add_argument('--n_iters', type=int, default=1000)
    parser.add_argument('--save_file', type=str, default='los_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False,
                          dtype={'period_length': np.float32, 'y_true': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False,
                          dtype={'period_length': np.float32, 'y_true': np.float32})

    df = test_df.merge(pred_df, left_on=['stay', 'period_length'],
                       right_on=['stay', 'period_length'],
                       how='left', suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    metrics = [('Kappa', 'kappa'), ('MAD', 'mad'), ('MSE', 'mse'),
               ('MAPE', 'mape')]

    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_regression(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_regression(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']

    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)
    print(results)
Example #20
Source File: evaluate_ihm.py From mimic3-benchmarks with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../data/in-hospital-mortality/test/listfile.csv'))
    parser.add_argument('--n_iters', type=int, default=10000)
    parser.add_argument('--save_file', type=str, default='ihm_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False)
    test_df = pd.read_csv(args.test_listfile, index_col=False)

    df = test_df.merge(pred_df, left_on='stay', right_on='stay', how='left',
                       suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    metrics = [('AUC of ROC', 'auroc'), ('AUC of PRC', 'auprc'),
               ('min(+P, Se)', 'minpse')]

    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_binary(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_binary(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']

    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)
    print(results)
Example #21
Source File: evaluate_decomp.py From mimic3-benchmarks with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../data/decompensation/test/listfile.csv'))
    parser.add_argument('--n_iters', type=int, default=1000)
    parser.add_argument('--save_file', type=str, default='decomp_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False,
                          dtype={'period_length': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False,
                          dtype={'period_length': np.float32})

    df = test_df.merge(pred_df, left_on=['stay', 'period_length'],
                       right_on=['stay', 'period_length'],
                       how='left', suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    metrics = [('AUC of ROC', 'auroc'), ('AUC of PRC', 'auprc'),
               ('min(+P, Se)', 'minpse')]

    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_binary(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_binary(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']

    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)
    print(results)
Example #22
Source File: causal_estimator.py From dowhy with MIT License
def _generate_bootstrap_estimates(self, num_bootstrap_simulations,
                                  sample_size_fraction):
    """ Helper function to generate causal estimates over bootstrapped samples.

    :param num_bootstrap_simulations: Number of simulations for the bootstrap method.
    :param sample_size_fraction: Fraction of the dataset to be resampled.
    :returns: A collections.namedtuple containing a list of bootstrapped estimates
        and a dictionary containing parameters used for the bootstrap.
    """
    # The array that stores the results of all estimations
    simulation_results = np.zeros(num_bootstrap_simulations)

    # Find the sample size as a proportion of the population size
    sample_size = int(sample_size_fraction * len(self._data))

    if sample_size > len(self._data):
        self.logger.warning("WARN: The sample size is greater than the data being sampled")

    self.logger.info("INFO: The sample size: {}".format(sample_size))
    self.logger.info("INFO: The number of simulations: {}".format(num_bootstrap_simulations))

    # Perform the set number of simulations
    for index in range(num_bootstrap_simulations):
        new_data = resample(self._data, n_samples=sample_size)

        new_estimator = type(self)(
            new_data,
            self._target_estimand,
            self._target_estimand.treatment_variable,
            self._target_estimand.outcome_variable,  # names of treatment and outcome
            test_significance=False,
            evaluate_effect_strength=False,
            confidence_intervals=False,
            target_units=self._target_units,
            effect_modifiers=self._effect_modifier_names,
            params=self.method_params
        )
        new_effect = new_estimator.estimate_effect()
        simulation_results[index] = new_effect.value

    estimates = CausalEstimator.BootstrapEstimates(
        simulation_results,
        {'num_simulations': num_bootstrap_simulations,
         'sample_size_fraction': sample_size_fraction})
    return estimates
Example #23
Source File: bootstrap_refuter.py From dowhy with MIT License
def refute_estimate(self, *args, **kwargs):
    if self._sample_size > len(self._data):
        self.logger.warning("The sample size is larger than the population size")

    sample_estimates = np.zeros(self._num_simulations)
    self.logger.info("Refutation over {} simulated datasets of size {} each"
                     .format(self._num_simulations, self._sample_size))

    for index in range(self._num_simulations):
        if self._random_state is None:
            new_data = resample(self._data, n_samples=self._sample_size)
        else:
            new_data = resample(self._data, n_samples=self._sample_size,
                                random_state=self._random_state)

        if self._chosen_variables is not None:
            for variable in self._chosen_variables:
                dtype_name = new_data[variable].dtype.name
                # Note: the original checked `('float' or 'int') in ...`, which
                # always tests only 'float'; both types are checked here.
                if 'float' in dtype_name or 'int' in dtype_name:
                    scaling_factor = new_data[variable].std()
                    new_data[variable] += np.random.normal(
                        loc=0.0, scale=self._noise * scaling_factor,
                        size=self._sample_size)

                elif 'bool' in dtype_name:
                    probs = np.random.uniform(0, 1, self._sample_size)
                    new_data[variable] = np.where(
                        probs < self._probability_of_change,
                        np.logical_not(new_data[variable]),
                        new_data[variable])

                elif 'category' in dtype_name:
                    categories = new_data[variable].unique()
                    # Find the set difference for each row
                    changed_data = new_data[variable].apply(
                        lambda row: list(set(categories) - set([row])))
                    # Choose one out of the remaining categories
                    changed_data = changed_data.apply(
                        lambda row: random.choice(row))
                    probs = np.random.uniform(0, 1, self._sample_size)
                    new_data[variable] = np.where(
                        probs < self._probability_of_change,
                        changed_data, new_data[variable])
                    new_data[variable] = new_data[variable].astype('category')

        new_estimator = CausalEstimator.get_estimator_object(
            new_data, self._target_estimand, self._estimate)
        new_effect = new_estimator.estimate_effect()
        sample_estimates[index] = new_effect.value

    refute = CausalRefutation(
        self._estimate.value,
        np.mean(sample_estimates),
        refutation_type="Refute: Bootstrap Sample Dataset")

    # We want to see if the estimate falls in the same distribution as the one
    # generated by the refuter. Ideally that should be the case, as running
    # bootstrap should not have a significant effect on the ability of the
    # treatment to affect the outcome.
    refute.add_significance_test_results(
        self.test_significance(self._estimate, sample_estimates))
    refute.add_refuter(self)
    return refute
Example #24
Source File: dataset_wrapper.py From interpret-community with MIT License
def sample(self, max_dim_clustering=Defaults.MAX_DIM,
           sampling_method=Defaults.HDBSCAN):
    """Sample the examples.

    First does random downsampling to upper_bound rows, then tries to find
    the optimal downsample based on how many clusters can be constructed
    from the data. If sampling_method is hdbscan, uses hdbscan to cluster
    the data and then downsamples to that number of clusters. If
    sampling_method is k-means, uses different values of k, cutting in half
    each time, and chooses the k with highest silhouette score to determine
    how much to downsample the data. The danger of using only random
    downsampling is that we might downsample too much or too little, so the
    clustering approach is a heuristic to give us some idea of how much we
    should downsample to.

    :param max_dim_clustering: Dimensionality threshold for performing reduction.
    :type max_dim_clustering: int
    :param sampling_method: Method to use for sampling, can be 'hdbscan' or 'kmeans'.
    :type sampling_method: str
    """
    from sklearn.utils import resample

    # bounds are rough estimates that came from manual investigation
    lower_bound = 200
    upper_bound = 10000
    num_rows = self._dataset.shape[0]
    module_logger.info('sampling examples')

    # If less than lower_bound rows, just return the full dataset
    if num_rows < lower_bound:
        return self._dataset
    # If more than upper_bound rows, sample randomly
    elif num_rows > upper_bound:
        module_logger.info('randomly sampling to 10k rows')
        self._dataset = resample(self._dataset, n_samples=upper_bound,
                                 random_state=7)
        num_rows = upper_bound

    if sampling_method == Defaults.HDBSCAN:
        try:
            opt_k = self._find_k_hdbscan(max_dim_clustering)
        except Exception as ex:
            module_logger.warning(
                ('Failed to use hdbscan due to error: {}'
                 '\nEnsure hdbscan is installed with: pip install hdbscan').format(str(ex)))
            opt_k = self._find_k_kmeans(max_dim_clustering)
    else:
        opt_k = self._find_k_kmeans(max_dim_clustering)

    # Resample based on optimal number of clusters
    if opt_k < num_rows:
        self._dataset = resample(self._dataset, n_samples=opt_k,
                                 random_state=7)
    return self._dataset