Python scipy.stats.ks_2samp() Examples

The following are 30 code examples of scipy.stats.ks_2samp(), collected from open-source projects. Each example notes its original project and source file, and you may also want to browse the other functions and classes available in the scipy.stats module.
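Before the project examples, here is a minimal, self-contained sketch of a ks_2samp call on synthetic data (the array names and sample sizes are illustrative, not taken from any project below). The test returns a statistic and a p-value; a small p-value suggests the two samples are unlikely to come from the same distribution.

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
a = rng.normal(loc=0.0, scale=1.0, size=200)
b = rng.normal(loc=0.0, scale=1.0, size=200)   # same distribution as a
c = rng.normal(loc=1.0, scale=1.0, size=200)   # shifted distribution

stat_ab, p_ab = ks_2samp(a, b)   # large p-value expected
stat_ac, p_ac = ks_2samp(a, c)   # very small p-value expected
print(stat_ab, p_ab)
print(stat_ac, p_ac)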
Example #1
Source File: test_mvknn.py    From cgpm with Apache License 2.0
def test_conditional_indicator(knn_xz):
    # Simulate from the conditional distribution of x|z (see
    # generate_real_nominal_data) and perform a KS test at each of the
    # subpopulations at the six levels of z.

    data = np.asarray(knn_xz.data.values())
    indicators = sorted(set(data[:,1].astype(int)))
    _, ax = plt.subplots()
    ax.set_title('Conditional Simulation Of X Given Indicator Z')
    for t in indicators:
        # Plot original data.
        data_subpop = data[data[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data.
        samples_subpop = [s[0] for s in
            knn_xz.simulate(-1, [0], constraints={1:t}, N=len(data_subpop))]
        ax.scatter(
            np.repeat(t, len(data_subpop)) + .25,
            samples_subpop, color=gu.colors[t])
        # KS test.
        pvalue = ks_2samp(data_subpop[:,0], samples_subpop)[1]
        assert .1 < pvalue
    ax.set_xlabel('z')
    ax.set_ylabel('x')
    ax.grid() 
Example #2
Source File: compare_genomes.py    From mCaller with MIT License
def compare_by_position(bed1,bed2,xmfa):
    pos_dict = {}

    for i, bed in enumerate([bed1, bed2]):
        pos_dict[i] = {}
        with open(bed, 'r') as fi:
            for line in fi:
                # Example line: 2  1892198 1892199 TCMMTMTTMMM 0.5 -   16
                csome, start, end, motif, perc_meth, strand, num_reads, probabilities = tuple(line.split('\t'))
                pos_dict[i][(csome, start, end, strand)] = (
                    (perc_meth, num_reads),
                    np.asarray([float(p) for p in probabilities.strip().split(',')]))

    for pos in pos_dict[0]:
        if pos in pos_dict[1]:
            # Compare the per-read methylation probabilities from the two samples.
            try:
                u, pval = mannwhitneyu(pos_dict[0][pos][1], pos_dict[1][pos][1], alternative='two-sided')
            except ValueError:
                u, pval = 'none', 'identical'
            u2, pval2 = ranksums(pos_dict[0][pos][1], pos_dict[1][pos][1])
            try:
                t, pval3 = ttest_ind(pos_dict[0][pos][1], pos_dict[1][pos][1])
            except:
                t, pval3 = 'none', 'missing df'
            d, pval4 = ks_2samp(pos_dict[0][pos][1], pos_dict[1][pos][1])
            if pval4 < 0.9:
                print(pos, pos_dict[0][pos][0], pos_dict[1][pos][0], pval, pval2, pval3, pval4) 
Example #3
Source File: test_weighted_statistics.py    From pyABC with BSD 3-Clause "New" or "Revised" License
def test_resample_deterministic():
    """
    Test the deterministic resampling routine.
    """
    nw = 50  # number of weighted points
    points = np.random.randn(nw)
    weights = np.random.rand(nw)
    weights /= np.sum(weights)

    n = 1000  # number of non-weighted points
    resampled_det = ws.resample_deterministic(points, weights, n, False)

    resampled = ws.resample(points, weights, n)

    # should be same distribution
    _, p = ks_2samp(resampled_det, resampled)
    assert p > 1e-2

    resampled_det2 = ws.resample_deterministic(points, weights, n, True)
    assert len(resampled_det2) == n

    _, p = ks_2samp(resampled_det2, resampled)
    assert p > 1e-2 
Example #4
Source File: test_weighted_statistics.py    From pyABC with BSD 3-Clause "New" or "Revised" License
def test_resample():
    """
    Test that the resampling process yields consistent distributions,
    using a KS test.
    """
    nw = 50  # number of weighted points
    points = np.random.randn(nw)
    weights = np.random.rand(nw)
    weights /= np.sum(weights)

    n = 1000  # number of non-weighted points
    # sample twice from same samples
    resampled1 = ws.resample(points, weights, n)
    resampled2 = ws.resample(points, weights, n)

    # should be same distribution
    _, p = ks_2samp(resampled1, resampled2)
    assert p > 1e-2

    # use different points
    points3 = np.random.randn(nw)
    resampled3 = ws.resample(points3, weights, n)
    # should be different distributions
    _, p = ks_2samp(resampled1, resampled3)
    assert p < 1e-2 
Example #5
Source File: stat_utils.py    From causallib with Apache License 2.0
def calc_weighted_ks2samp(x, y, wx, wy):
    """
    Weighted Kolmogorov-Smirnov

    References:
        [1] https://stackoverflow.com/a/40059727
    """
    x_ix = np.argsort(x)
    y_ix = np.argsort(y)
    x, wx = x[x_ix], wx[x_ix]
    y, wy = y[y_ix], wy[y_ix]
    data = np.concatenate((x, y))
    wx_cum = np.hstack([0, wx.cumsum() / wx.sum()])
    wy_cum = np.hstack([0, wy.cumsum() / wy.sum()])
    # Align the "steps" between the two distribution so the differences will be well defined:
    x_align = wx_cum[[np.searchsorted(x, data, side="right")]]
    y_align = wy_cum[[np.searchsorted(y, data, side="right")]]
    stat = np.max(np.abs(x_align - y_align))
    # stat = ks_2samp(wx * x, wy * y)
    return stat 
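A quick usage sketch for calc_weighted_ks2samp above, with hypothetical arrays: with all weights equal, the weighted statistic should match the ordinary unweighted scipy.stats.ks_2samp statistic.

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(1)
x = rng.normal(size=100)
y = rng.normal(loc=0.5, size=100)
wx = np.ones_like(x)  # equal weights
wy = np.ones_like(y)

stat_weighted = calc_weighted_ks2samp(x, y, wx, wy)
stat_unweighted, _ = ks_2samp(x, y)
print(stat_weighted, stat_unweighted)  # should agree for equal weights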
Example #6
Source File: feature_selection.py    From default-credit-card-prediction with MIT License
def kolmogorov_smirnov_two_sample_test(X, y):
    """
    Performs the two-sample Kolmogorov-Smirnov test, testing whether the feature values of each class are drawn from identical distributions

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    """

    kolmogorov_smirnov = [(0, 0)] * len(X[0])
    # print(kolmogorov_smirnov)
    for feature_col in range(len(X[0])):
        ks_test_statistic, p_value = stats.ks_2samp(X[y==0, feature_col], X[y==1, feature_col])
        kolmogorov_smirnov[feature_col] = (ks_test_statistic, p_value)

    # debug
    for f in range(23):
        print(kolmogorov_smirnov[f])

    return kolmogorov_smirnov 
Example #7
Source File: ks.py    From alibi-detect with Apache License 2.0
def feature_score(self, X_ref: np.ndarray, X: np.ndarray) -> np.ndarray:
        """
        Compute K-S scores per feature.

        Parameters
        ----------
        X_ref
            Reference instances to compare distribution with.
        X
            Batch of instances.

        Returns
        -------
        Feature level drift scores.
        """
        X = X.reshape(X.shape[0], -1)
        X_ref = X_ref.reshape(X_ref.shape[0], -1)
        p_val = np.zeros(self.n_features, dtype=np.float32)
        for f in range(self.n_features):
            # TODO: update to 'exact' when bug fix is released in scipy 1.5
            p_val[f] = ks_2samp(X_ref[:, f], X[:, f], alternative=self.alternative, mode='asymp')[1]
        return p_val 
Example #8
Source File: test_multivariate.py    From GraphicDesignPatternByPython with MIT License
def test_pairwise_distances(self):
        # Test that the distribution of pairwise distances is close to correct.
        np.random.seed(514)

        def random_ortho(dim):
            u, _s, v = np.linalg.svd(np.random.normal(size=(dim, dim)))
            return np.dot(u, v)

        for dim in range(2, 6):
            def generate_test_statistics(rvs, N=1000, eps=1e-10):
                stats = np.array([
                    np.sum((rvs(dim=dim) - rvs(dim=dim))**2)
                    for _ in range(N)
                ])
                # Add a bit of noise to account for numeric accuracy.
                stats += np.random.uniform(-eps, eps, size=stats.shape)
                return stats

            expected = generate_test_statistics(random_ortho)
            actual = generate_test_statistics(scipy.stats.ortho_group.rvs)

            _D, p = scipy.stats.ks_2samp(expected, actual)

            assert_array_less(.05, p) 
Example #9
Source File: atlas3.py    From ssbio with MIT License
def get_pca_ks_stats(self, maxrange=5):
        """Get a dictionary of PC#: K-S test stat for each """
        pc_to_phenotype_pairs = {}
        num_components = self.principal_observations_df.shape[1]
        if num_components < maxrange:
            maxrange = num_components

        phenotypes = self.principal_observations_df.phenotype.unique().tolist()
        for i in range(0, maxrange):
            phenotype_pair_to_ks = {}
            for p1, p2 in combinations(phenotypes, 2):
                p1_pc = self.principal_observations_df[self.principal_observations_df.phenotype == p1].iloc[:, i].values
                p2_pc = self.principal_observations_df[self.principal_observations_df.phenotype == p2].iloc[:, i].values
                phenotype_pair_to_ks[(p1, p2)] = ks_2samp(p1_pc, p2_pc)
            pc_to_phenotype_pairs[i + 1] = phenotype_pair_to_ks

        return pc_to_phenotype_pairs 
Example #10
Source File: test_mvknn.py    From cgpm with Apache License 2.0
def test_joint(knn_xz):
    # Simulate from the joint distribution of x,z (see
    # generate_real_nominal_data) and perform a KS test at each of the
    # subpopulations at the six levels of z.

    data = np.asarray(knn_xz.data.values())
    indicators = sorted(set(data[:,1].astype(int)))
    joint_samples = knn_xz.simulate(-1, [0,1], N=len(data))
    _, ax = plt.subplots()
    ax.set_title('Joint Simulation')
    for t in indicators:
        # Plot original data.
        data_subpop = data[data[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data for indicator t.
        samples_subpop = [j[0] for j in joint_samples if j[1] == t]
        ax.scatter(
            np.add([t]*len(samples_subpop), .25), samples_subpop,
            color=gu.colors[t])
        # KS test.
        pvalue = ks_2samp(data_subpop[:,0], samples_subpop)[1]
        assert .05 < pvalue
    ax.set_xlabel('z')
    ax.set_ylabel('x')
    ax.grid() 
Example #11
Source File: test_mvkde.py    From cgpm with Apache License 2.0
def test_joint(kde_xz):
    # Simulate from the joint distribution of x,z (see
    # generate_real_nominal_data) and perform a KS test at each of the
    # subpopulations at the six levels of z.

    data = np.asarray(kde_xz.data.values())
    indicators = sorted(set(data[:,1].astype(int)))
    joint_samples = kde_xz.simulate(-1, [0,1], N=len(data))
    _, ax = plt.subplots()
    ax.set_title('Joint Simulation')
    for t in indicators:
        # Plot original data.
        data_subpop = data[data[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data for indicator t.
        samples_subpop = [j[0] for j in joint_samples if j[1] == t]
        ax.scatter(
            np.add([t]*len(samples_subpop), .25), samples_subpop,
            color=gu.colors[t])
        # KS test.
        _, p = ks_2samp(data_subpop[:,0], samples_subpop)
        assert .05 < p
    ax.set_xlabel('z')
    ax.set_ylabel('x')
    ax.grid() 
Example #12
Source File: test_mvkde.py    From cgpm with Apache License 2.0
def test_conditional_indicator(kde_xz):
    # Simulate from the conditional distribution of x|z (see
    # generate_real_nominal_data) and perform a KS test at each of the
    # subpopulations at the six levels of z.

    data = np.asarray(kde_xz.data.values())
    indicators = sorted(set(data[:,1].astype(int)))
    _, ax = plt.subplots()
    ax.set_title('Conditional Simulation Of X Given Indicator Z')
    for t in indicators:
        # Plot original data.
        data_subpop = data[data[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data.
        samples_subpop = [s[0] for s in
            kde_xz.simulate(-1, [0], {1:t}, None, N=len(data_subpop))]
        ax.scatter(
            np.repeat(t, len(data_subpop)) + .25,
            samples_subpop, color=gu.colors[t])
        # KS test.
        _, p = ks_2samp(data_subpop[:,0], samples_subpop)
        assert .1 < p
    ax.set_xlabel('z')
    ax.set_ylabel('x')
    ax.grid() 
Example #13
Source File: test_normal_categorical.py    From cgpm with Apache License 2.0
def test_joint(state):
    # Simulate from the joint distribution of (x,z).
    joint_samples = state.simulate(-1, [0,1], N=N_SAMPLES)
    _, ax = plt.subplots()
    ax.set_title('Joint Simulation')
    for t in INDICATORS:
        # Plot original data.
        data_subpop = DATA[DATA[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data for indicator t.
        samples_subpop = [j[0] for j in joint_samples if j[1] == t]
        ax.scatter(
            np.add([t]*len(samples_subpop), .25), samples_subpop,
            color=gu.colors[t])
        # KS test.
        pvalue = ks_2samp(data_subpop[:,0], samples_subpop)[1]
        assert .05 < pvalue
    ax.set_xlabel('Indicator')
    ax.set_ylabel('x')
    ax.grid() 
Example #14
Source File: test_loom_simulate_bivariate_gaussian.py    From bayeslite with Apache License 2.0
def test_simulate_y_from_partially_populated_fresh_row(seed):
    """Check that Loom conditions on partial observation in new rowid."""
    means = ((0,20), (20,0))
    sample_size = 50
    mix_ratio = [0.7, 0.3]
    table = 'data'

    with bayeslite.bayesdb_open(seed=seed) as bdb:
        sample_gaussians = axis_aligned_gaussians(means, sample_size, bdb._np_prng)
        samples = mix(sample_gaussians, mix_ratio, bdb._np_prng)
        register_loom(bdb)
        prepare_bdb(bdb, samples, table)

        rowid = insert_row(bdb, table, means[0][0], None)
        simulated_samples = simulate_from_rowid(bdb, table, 1, rowid,
            limit=sample_size)

    y_samples = [y for _x, y in sample_gaussians[0]]
    _statistic, p_value = stats.ks_2samp(y_samples, simulated_samples)
    assert 0.10 < p_value 
Example #15
Source File: test_normal_categorical.py    From cgpm with Apache License 2.0
def test_conditional_indicator(state):
    # Simulate from the conditional X|Z
    _, ax = plt.subplots()
    ax.set_title('Conditional Simulation Of Data X Given Indicator Z')
    for t in INDICATORS:
        # Plot original data.
        data_subpop = DATA[DATA[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data.
        samples_subpop = [s[0] for s in
            state.simulate(-1, [0], {1:t}, None, len(data_subpop))]
        ax.scatter(
            np.repeat(t, len(data_subpop)) + .25,
            samples_subpop, color=gu.colors[t])
        # KS test.
        pvalue = ks_2samp(data_subpop[:,0], samples_subpop)[1]
        assert .01 < pvalue
    ax.set_xlabel('Indicator')
    ax.set_ylabel('x')
    ax.grid() 
Example #16
Source File: test_tableone.py    From tableone with MIT License
def mytest(*args):
    """
    Hypothesis test for test_self_defined_statistical_tests
    """
    mytest.__name__ = "Test name"
    _, pval = stats.ks_2samp(*args)
    return pval 
Example #17
Source File: edgepy.py    From edgePy with MIT License
def ks_2_samples(self):
        """Run a 2-tailed Kolmogorov-Smirnov test on the DGEList object.

        Args:
            None.

        Returns:
            gene_details: a dictionary of dictionaries (keyed by gene), holding mean1 and mean2 for the two groups
            gene_likelihood: a dictionary (keyed by gene), holding the p-value of the separation of the two groups
            group_types: list of the groups in order.

        """
        gene_likelihood1: Dict[Hashable, float] = {}
        group_types = set(self.dge_list.groups_list)
        group_types = list(group_types)
        group_filters: Dict[Hashable, Any] = {}
        gene_details: Dict[Hashable, Dict[Hashable, Any]] = {}
        for group in group_types:
            group_filters[group] = [g == group for g in self.dge_list.groups_list]
        for gene_idx, gene in enumerate(self.dge_list.genes):
            gene_row = self.dge_list.counts[gene_idx]
            if len(group_types) == 2:
                group_data1 = gene_row.compress(group_filters[group_types[0]])
                mean1 = np.mean(group_data1)

                group_data2 = gene_row.compress(group_filters[group_types[1]])
                mean2 = np.mean(group_data2)

                gene_likelihood1[gene] = ks_2samp(group_data1, group_data2)[1]

                gene_details[gene] = {'mean1': mean1, 'mean2': mean2}
        return gene_details, gene_likelihood1, group_types 
Example #18
Source File: feature_selection.py    From default-credit-card-prediction with MIT License
def kolmogorov_smirnov_two_sample_test(sample_a,sample_b):
	"""
	Performs the two-sample Kolmogorov-Smirnov test, testing whether two samples are drawn from identical distributions

	Keyword arguments:
	sample_a -- The first sample
	sample_b -- The second sample
	"""

	return stats.ks_2samp(sample_a,sample_b) 
Example #19
Source File: metrics.py    From toad with MIT License
def KS(score, target):
    """calculate ks value

    Args:
        score (array-like): list of score or probability that the model predict
        target (array-like): list of real target

    Returns:
        float: the max KS value
    """
    mask = target == 1
    res = ks_2samp(score[mask], score[~mask])
    return res[0] 
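A usage sketch for the KS metric above, on hypothetical scores and binary targets; the positive-class scores are shifted upward, so the KS value should be clearly above zero.

import numpy as np
from scipy.stats import ks_2samp  # used inside KS above

rng = np.random.default_rng(2)
target = rng.integers(0, 2, size=1000)          # hypothetical binary labels
score = rng.normal(size=1000) + 0.8 * target    # positives tend to score higher

print(KS(score, target))  # maximum distance between the two score distributions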
Example #20
Source File: test_mvkde.py    From cgpm with Apache License 2.0
def test_univariate_two_sample(i):
    # This test ensures posterior sampling of unimodal/bimodal distributions on
    # R. When the plot is shown, a density curve overlays the samples, which is
    # useful for seeing that logpdf/simulate agree.
    N_SAMPLES = 100

    rng = gu.gen_rng(2)
    # Synthetic samples.
    samples_train = SAMPLES[i](N_SAMPLES, rng)
    samples_test = SAMPLES[i](N_SAMPLES, rng)
    # Univariate KDE.
    kde = MultivariateKde([3], None, distargs={O: {ST: [N], SA:[{}]}}, rng=rng)
    # Incorporate observations.
    for rowid, x in enumerate(samples_train):
        kde.incorporate(rowid, {3: x})
    # Run inference.
    kde.transition()
    # Generate posterior samples.
    samples_gen = [s[3] for s in kde.simulate(-1, [3], N=N_SAMPLES)]
    # Plot comparison of all train, test, and generated samples.
    fig, ax = plt.subplots()
    ax.scatter(samples_train, [0]*len(samples_train), color='b', label='Train')
    ax.scatter(samples_gen, [1]*len(samples_gen), color='r', label='KDE')
    ax.scatter(samples_test, [2]*len(samples_test), color='g', label='Test')
    # Overlay the density function.
    xs = np.linspace(ax.get_xlim()[0], ax.get_xlim()[1], 200)
    pdfs = [kde.logpdf(-1, {3: x}) for x in xs]
    # Rescale the pdfs to roughly the range 1 to 1.5 for plotting.
    pdfs_plot = np.exp(pdfs)+1
    pdfs_plot = (pdfs_plot/max(pdfs_plot)) * 1.5
    ax.plot(xs, pdfs_plot, color='k')
    # Clear up some labels.
    ax.set_title('Univariate KDE Posterior versus Generator')
    ax.set_xlabel('x')
    ax.set_yticklabels([])
    # Show the plot.
    ax.grid()
    plt.close()
    # KS test
    _, p = ks_2samp(samples_test, samples_gen)
    assert .05 < p 
Example #21
Source File: filters.py    From causallib with Apache License 2.0
def compute_pvals(self, X, y):
        # TODO: export to stats_utils?
        is_y_binary = (len(np.unique(y)) == 2)
        # is_binary_feature = np.sum(((X != np.nanmin(X, axis=0)[np.newaxis, :]) &
        #                             (X != np.nanmax(X, axis=0)[np.newaxis, :])), axis=0) == 0
        is_binary_feature = areColumnsBinary(X)
        p_vals = np.zeros(X.shape[1])
        if is_y_binary:
            # Process non-binary columns:
            for i in np.where(~is_binary_feature)[0]:
                x0 = X.loc[y == 0, i]
                x1 = X.loc[y == 1, i]
                if self.is_linear:
                    _, p_vals[i] = stats.ttest_ind(x0, x1)
                else:
                    _, p_vals[i] = stats.ks_2samp(x0, x1)

            # Process binary features:
            _, p_vals[is_binary_feature] = feature_selection.chi2(X.loc[:, is_binary_feature], y)

        else:
            # Process non-binary features:
            _, p_vals[~is_binary_feature] = feature_selection.f_regression(X.loc[:, ~is_binary_feature], y)

            # Process binary features:
            y_mat = np.row_stack(y)
            for i in np.where(is_binary_feature)[0]:
                _, p_vals[i] = feature_selection.f_regression(y_mat, X.loc[:, i])
        return p_vals 
Example #22
Source File: test_sample.py    From pyPESTO with BSD 3-Clause "New" or "Revised" License
def test_prior():
    """Check that priors are defined for sampling."""
    # define negative log posterior
    posterior_fun = pypesto.Objective(fun=negative_log_posterior)

    # define negative log prior
    prior_fun = pypesto.Objective(fun=negative_log_prior)

    # define pypesto prior object
    prior_object = pypesto.NegLogPriors(objectives=[prior_fun])

    # define pypesto problem using prior object
    test_problem = pypesto.Problem(objective=posterior_fun,
                                   x_priors_defs=prior_object,
                                   lb=-10, ub=10,
                                   x_names=['x'])

    sampler = sample.AdaptiveMetropolisSampler()

    result = sample.sample(test_problem, n_samples=1e4, sampler=sampler,
                           x0=np.array([0.]))

    # get log prior values of first chain
    logprior_trace = -result.sample_result.trace_neglogprior[0, :]

    # check that not all entries are zero
    assert (logprior_trace != 0.).any()

    # get samples of first chain
    samples = result.sample_result.trace_x[0, :, 0]

    # generate ground-truth samples
    rvs = norm.rvs(size=5000, loc=-1., scale=np.sqrt(0.7))

    # check sample distribution agreement with the ground-truth
    statistic, pval = ks_2samp(rvs, samples)
    print(statistic, pval)

    assert statistic < 0.1 
Example #23
Source File: test_dc_stat_think.py    From dc_stat_think with MIT License
def test_ks_stat(x):
    theor_data = np.random.normal(0, 1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)

    theor_data = np.random.exponential(1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)

    theor_data = np.random.logistic(0, 1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct) 
Example #24
Source File: test_dc_stat_think.py    From dc_stat_think with MIT License
def test_pandas_conversion(seed):
    df = pd.DataFrame({'a': [3, 2, 1, 4],
                       'b': [8, 6, 7, 5],
                       'c': [9.1, 10.1, 11.1, np.nan]})

    x, y = dcst.ecdf(df.loc[:, 'a'])
    assert (x == np.array([1, 2, 3, 4])).all()
    assert (y == np.array([0.25, 0.5, 0.75, 1.0])).all()

    x, y = dcst.ecdf(df.loc[:, 'c'])
    assert np.allclose(x, np.array([9.1, 10.1, 11.1]))
    assert np.allclose(y, np.array([1/3, 2/3, 1.0]))

    df = pd.DataFrame({
        'a': np.concatenate((np.random.normal(0, 1, size=10), [np.nan]*990)),
        'b': np.random.normal(0, 1, size=1000)})
    correct, _ = st.ks_2samp(df['a'].dropna(), df['b'])
    assert np.isclose(dcst.ks_stat(df['a'], df['b']), correct)

    df = pd.DataFrame({
        'a': np.concatenate((np.random.normal(0, 1, size=80), [np.nan]*20)),
        'b': np.random.normal(0, 1, size=100)})
    dcst_private._seed_numba(seed)
    correct = dcst.draw_bs_reps(df['a'].values, np.mean, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_bs_reps(df['a'], np.mean, size=100), correct,
                       atol=atol)

    dcst_private._seed_numba(seed)
    correct = dcst.draw_bs_reps(df['b'].values, np.mean, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_bs_reps(df['b'], np.mean, size=100), correct,
                       atol=atol)

    dcst_private._seed_numba(seed)
    correct = dcst.draw_perm_reps(df['a'].values, df['b'].values,
                                  dcst.diff_of_means, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_perm_reps(df['a'], df['b'],
                       dcst.diff_of_means, size=100), correct, atol=atol) 
Example #25
Source File: _dp_verification.py    From whitenoise-system with MIT License
def ks_test(self, fD1, fD2):
        """
        K-S Two sample test between the repeated query results on neighboring datasets
        """
        return stats.ks_2samp(fD1, fD2) 
Example #26
Source File: test_dissimilarity.py    From flyingpigeon with Apache License 2.0
def test_1D_ks_2samp(self):
        # Compare with scipy.stats.ks_2samp
        x = np.random.randn(50) + 1
        y = np.random.randn(50)
        s, p = stats.ks_2samp(x, y)
        dm = dd.kolmogorov_smirnov(x, y)
        aaeq(dm, s, 3) 
Example #27
Source File: sample_from_the_prior_test.py    From bilby with MIT License
def ks_2samp_wrapper(data1, data2):
    if version.parse(scipy.__version__) >= version.parse("1.3.0"):
        return ks_2samp(data1, data2, alternative="two-sided", mode="asymp")
    else:
        return ks_2samp(data1, data2) 
Example #28
Source File: gw_utils_test.py    From bilby with MIT License
def test_conversion_gives_correct_prior(self) -> None:
        zeniths = self.samples["zenith"]
        azimuths = self.samples["azimuth"]
        times = self.samples["time"]
        args = zip(*[
            (zenith, azimuth, time, self.ifos)
            for zenith, azimuth, time in zip(zeniths, azimuths, times)
        ])
        ras, decs = zip(*map(bilby.gw.utils.zenith_azimuth_to_ra_dec, *args))
        self.assertGreaterEqual(ks_2samp(self.samples["ra"], ras).pvalue, 0.01)
        self.assertGreaterEqual(ks_2samp(self.samples["dec"], decs).pvalue, 0.01) 
Example #29
Source File: burn_in.py    From pycbc with GNU General Public License v3.0
def ks_test(samples1, samples2, threshold=0.9):
    """Applies a KS test to determine if two sets of samples are the same.

    The ks test is applied parameter-by-parameter. If the two-tailed p-value
    returned by the test is greater than ``threshold``, the samples are
    considered to be the same.

    Parameters
    ----------
    samples1 : dict
        Dictionary mapping parameter names to the first set of samples.
    samples2 : dict
        Dictionary mapping parameter names to the second set of samples.
    threshold : float
        The threshold to use for the p-value. Default is 0.9.

    Returns
    -------
    dict :
        Dictionary mapping parameter names to booleans indicating whether the
        given parameter passes the KS test.
    """
    is_the_same = {}
    assert set(samples1.keys()) == set(samples2.keys()), (
        "samples1 and 2 must have the same parameters")
    # iterate over the parameters
    for param in samples1:
        s1 = samples1[param]
        s2 = samples2[param]
        _, p_value = ks_2samp(s1, s2)
        is_the_same[param] = p_value > threshold
    return is_the_same 
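A quick sketch of calling ks_test above on two hypothetical dictionaries of samples; the parameter names and values are illustrative only.

import numpy as np
from scipy.stats import ks_2samp  # required by ks_test above

rng = np.random.default_rng(3)
samples1 = {'mass': rng.normal(30, 5, size=500),
            'spin': rng.uniform(0, 1, size=500)}
samples2 = {'mass': rng.normal(30, 5, size=500),
            'spin': rng.uniform(0, 1, size=500)}

# Maps each parameter to True only if the KS p-value exceeds the threshold.
print(ks_test(samples1, samples2, threshold=0.9))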
Example #30
Source File: kswin.py    From scikit-multiflow with BSD 3-Clause "New" or "Revised" License
def add_element(self, input_value):
        """ Add element to sliding window

        Adds an element on top of the sliding window and removes
        the oldest one from the window. Afterwards, the KS-test
        is performed.

        Parameters
        ----------
        input_value: ndarray
            New data sample the sliding window should add.
        """
        self.n += 1
        currentLength = self.window.shape[0]
        if currentLength >= self.window_size:
            self.window = np.delete(self.window,0)
            rnd_window = np.random.choice(self.window[:-self.stat_size], self.stat_size)

            (st, self.p_value) = stats.ks_2samp(rnd_window, self.window[-self.stat_size:],mode="exact")

            if self.p_value <= self.alpha and st > 0.1:
                self.change_detected = True
                self.window = self.window[-self.stat_size:]
            else:
                self.change_detected = False
        else: # Not enough samples in sliding window for a valid test
            self.change_detected = False

        self.window = np.concatenate([self.window,[input_value]])
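A minimal usage sketch for the detector above, assuming scikit-multiflow exposes KSWIN under skmultiflow.drift_detection with the constructor parameters shown; the stream values are synthetic and purely illustrative.

import numpy as np
from skmultiflow.drift_detection import KSWIN

rng = np.random.default_rng(4)
# Synthetic stream whose distribution shifts halfway through.
stream = np.concatenate([rng.normal(0, 1, size=1000),
                         rng.normal(3, 1, size=1000)])

kswin = KSWIN(alpha=0.005, window_size=100, stat_size=30)
for i, value in enumerate(stream):
    kswin.add_element(value)
    if kswin.detected_change():
        print('Change detected at index', i)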