Python scipy.stats.ks_2samp() Examples

The following are 30 code examples of scipy.stats.ks_2samp(), collected from open-source projects. Each example notes its original project and source file, and you may also want to browse the other functions and classes available in the scipy.stats module.
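Before the project examples, here is a minimal, self-contained sketch of a ks_2samp call on synthetic data (the array names and sample sizes are illustrative, not taken from any project below). The test returns a statistic and a p-value; a small p-value suggests the two samples are unlikely to come from the same distribution.

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
a = rng.normal(loc=0.0, scale=1.0, size=200)
b = rng.normal(loc=0.0, scale=1.0, size=200)   # same distribution as a
c = rng.normal(loc=1.0, scale=1.0, size=200)   # shifted distribution

stat_ab, p_ab = ks_2samp(a, b)   # large p-value expected
stat_ac, p_ac = ks_2samp(a, c)   # very small p-value expected
print(stat_ab, p_ab)
print(stat_ac, p_ac)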
Example #1
Source File: test_mvknn.py    From cgpm with Apache License 2.0
def test_conditional_indicator(knn_xz):
    # Simulate from the conditional distribution of x|z (see
    # generate_real_nominal_data) and perform a KS test at each of the
    # subpopulations at the six levels of z.

    data = np.asarray(knn_xz.data.values())
    indicators = sorted(set(data[:,1].astype(int)))
    _, ax = plt.subplots()
    ax.set_title('Conditional Simulation Of X Given Indicator Z')
    for t in indicators:
        # Plot original data.
        data_subpop = data[data[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data.
        samples_subpop = [s[0] for s in
            knn_xz.simulate(-1, [0], constraints={1:t}, N=len(data_subpop))]
        ax.scatter(
            np.repeat(t, len(data_subpop)) + .25,
            samples_subpop, color=gu.colors[t])
        # KS test.
        pvalue = ks_2samp(data_subpop[:,0], samples_subpop)[1]
        assert .1 < pvalue
    ax.set_xlabel('z')
    ax.set_ylabel('x')
    ax.grid() 
Example #2
Source File: compare_genomes.py    From mCaller with MIT License
def compare_by_position(bed1,bed2,xmfa):
    pos_dict = {}

    for i, bed in enumerate([bed1, bed2]):
        pos_dict[i] = {}
        with open(bed, 'r') as fi:
            for line in fi:
                # Example line: 2  1892198 1892199 TCMMTMTTMMM 0.5 -   16
                csome, start, end, motif, perc_meth, strand, num_reads, probabilities = tuple(line.split('\t'))
                pos_dict[i][(csome, start, end, strand)] = (
                    (perc_meth, num_reads),
                    np.asarray([float(p) for p in probabilities.strip().split(',')]))

    for pos in pos_dict[0]:
        if pos in pos_dict[1]:
            # Compare the per-read methylation probabilities from the two samples.
            try:
                u, pval = mannwhitneyu(pos_dict[0][pos][1], pos_dict[1][pos][1], alternative='two-sided')
            except ValueError:
                u, pval = 'none', 'identical'
            u2, pval2 = ranksums(pos_dict[0][pos][1], pos_dict[1][pos][1])
            try:
                t, pval3 = ttest_ind(pos_dict[0][pos][1], pos_dict[1][pos][1])
            except:
                t, pval3 = 'none', 'missing df'
            d, pval4 = ks_2samp(pos_dict[0][pos][1], pos_dict[1][pos][1])
            if pval4 < 0.9:
                print(pos, pos_dict[0][pos][0], pos_dict[1][pos][0], pval, pval2, pval3, pval4) 
Example #3
Source File: test_weighted_statistics.py    From pyABC with BSD 3-Clause "New" or "Revised" License
def test_resample_deterministic():
    """
    Test the deterministic resampling routine.
    """
    nw = 50  # number of weighted points
    points = np.random.randn(nw)
    weights = np.random.rand(nw)
    weights /= np.sum(weights)

    n = 1000  # number of non-weighted points
    resampled_det = ws.resample_deterministic(points, weights, n, False)

    resampled = ws.resample(points, weights, n)

    # should be same distribution
    _, p = ks_2samp(resampled_det, resampled)
    assert p > 1e-2

    resampled_det2 = ws.resample_deterministic(points, weights, n, True)
    assert len(resampled_det2) == n

    _, p = ks_2samp(resampled_det2, resampled)
    assert p > 1e-2 
Example #4
Source File: test_weighted_statistics.py    From pyABC with BSD 3-Clause "New" or "Revised" License
def test_resample():
    """
    Test that the resampling process yields consistent distributions,
    using a KS test.
    """
    nw = 50  # number of weighted points
    points = np.random.randn(nw)
    weights = np.random.rand(nw)
    weights /= np.sum(weights)

    n = 1000  # number of non-weighted points
    # sample twice from same samples
    resampled1 = ws.resample(points, weights, n)
    resampled2 = ws.resample(points, weights, n)

    # should be same distribution
    _, p = ks_2samp(resampled1, resampled2)
    assert p > 1e-2

    # use different points
    points3 = np.random.randn(nw)
    resampled3 = ws.resample(points3, weights, n)
    # should be different distributions
    _, p = ks_2samp(resampled1, resampled3)
    assert p < 1e-2 
Example #5
Source File: stat_utils.py    From causallib with Apache License 2.0
def calc_weighted_ks2samp(x, y, wx, wy):
    """
    Weighted Kolmogorov-Smirnov

    References:
        [1] https://stackoverflow.com/a/40059727
    """
    x_ix = np.argsort(x)
    y_ix = np.argsort(y)
    x, wx = x[x_ix], wx[x_ix]
    y, wy = y[y_ix], wy[y_ix]
    data = np.concatenate((x, y))
    wx_cum = np.hstack([0, wx.cumsum() / wx.sum()])
    wy_cum = np.hstack([0, wy.cumsum() / wy.sum()])
    # Align the "steps" between the two distribution so the differences will be well defined:
    x_align = wx_cum[[np.searchsorted(x, data, side="right")]]
    y_align = wy_cum[[np.searchsorted(y, data, side="right")]]
    stat = np.max(np.abs(x_align - y_align))
    # stat = ks_2samp(wx * x, wy * y)
    return stat 
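A quick usage sketch for calc_weighted_ks2samp above, with hypothetical arrays: with all weights equal, the weighted statistic should match the ordinary unweighted scipy.stats.ks_2samp statistic.

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(1)
x = rng.normal(size=100)
y = rng.normal(loc=0.5, size=100)
wx = np.ones_like(x)  # equal weights
wy = np.ones_like(y)

stat_weighted = calc_weighted_ks2samp(x, y, wx, wy)
stat_unweighted, _ = ks_2samp(x, y)
print(stat_weighted, stat_unweighted)  # should agree for equal weights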
Example #6
Source File: feature_selection.py    From default-credit-card-prediction with MIT License
def kolmogorov_smirnov_two_sample_test(X, y):
    """
    Performs the two-sample Kolmogorov-Smirnov test, testing whether the feature values of each class are drawn from identical distributions

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    """

    kolmogorov_smirnov = [(0, 0)] * len(X[0])
    # print(kolmogorov_smirnov)
    for feature_col in range(len(X[0])):
        ks_test_statistic, p_value = stats.ks_2samp(X[y==0, feature_col], X[y==1, feature_col])
        kolmogorov_smirnov[feature_col] = (ks_test_statistic, p_value)

    # debug
    for f in range(23):
        print(kolmogorov_smirnov[f])

    return kolmogorov_smirnov 
Example #7
Source File: ks.py    From alibi-detect with Apache License 2.0
def feature_score(self, X_ref: np.ndarray, X: np.ndarray) -> np.ndarray:
        """
        Compute K-S scores per feature.

        Parameters
        ----------
        X_ref
            Reference instances to compare distribution with.
        X
            Batch of instances.

        Returns
        -------
        Feature level drift scores.
        """
        X = X.reshape(X.shape[0], -1)
        X_ref = X_ref.reshape(X_ref.shape[0], -1)
        p_val = np.zeros(self.n_features, dtype=np.float32)
        for f in range(self.n_features):
            # TODO: update to 'exact' when bug fix is released in scipy 1.5
            p_val[f] = ks_2samp(X_ref[:, f], X[:, f], alternative=self.alternative, mode='asymp')[1]
        return p_val 
Example #8
Source File: test_multivariate.py    From GraphicDesignPatternByPython with MIT License
def test_pairwise_distances(self):
        # Test that the distribution of pairwise distances is close to correct.
        np.random.seed(514)

        def random_ortho(dim):
            u, _s, v = np.linalg.svd(np.random.normal(size=(dim, dim)))
            return np.dot(u, v)

        for dim in range(2, 6):
            def generate_test_statistics(rvs, N=1000, eps=1e-10):
                stats = np.array([
                    np.sum((rvs(dim=dim) - rvs(dim=dim))**2)
                    for _ in range(N)
                ])
                # Add a bit of noise to account for numeric accuracy.
                stats += np.random.uniform(-eps, eps, size=stats.shape)
                return stats

            expected = generate_test_statistics(random_ortho)
            actual = generate_test_statistics(scipy.stats.ortho_group.rvs)

            _D, p = scipy.stats.ks_2samp(expected, actual)

            assert_array_less(.05, p) 
Example #9
Source File: atlas3.py    From ssbio with MIT License
def get_pca_ks_stats(self, maxrange=5):
        """Get a dictionary of PC#: K-S test stat for each """
        pc_to_phenotype_pairs = {}
        num_components = self.principal_observations_df.shape[1]
        if num_components < maxrange:
            maxrange = num_components

        phenotypes = self.principal_observations_df.phenotype.unique().tolist()
        for i in range(0, maxrange):
            phenotype_pair_to_ks = {}
            for p1, p2 in combinations(phenotypes, 2):
                p1_pc = self.principal_observations_df[self.principal_observations_df.phenotype == p1].iloc[:, i].values
                p2_pc = self.principal_observations_df[self.principal_observations_df.phenotype == p2].iloc[:, i].values
                phenotype_pair_to_ks[(p1, p2)] = ks_2samp(p1_pc, p2_pc)
            pc_to_phenotype_pairs[i + 1] = phenotype_pair_to_ks

        return pc_to_phenotype_pairs 
Example #10
Source File: test_mvknn.py    From cgpm with Apache License 2.0
def test_joint(knn_xz):
    # Simulate from the joint distribution of x,z (see
    # generate_real_nominal_data) and perform a KS test at each of the
    # subpopulations at the six levels of z.

    data = np.asarray(knn_xz.data.values())
    indicators = sorted(set(data[:,1].astype(int)))
    joint_samples = knn_xz.simulate(-1, [0,1], N=len(data))
    _, ax = plt.subplots()
    ax.set_title('Joint Simulation')
    for t in indicators:
        # Plot original data.
        data_subpop = data[data[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data for indicator t.
        samples_subpop = [j[0] for j in joint_samples if j[1] == t]
        ax.scatter(
            np.add([t]*len(samples_subpop), .25), samples_subpop,
            color=gu.colors[t])
        # KS test.
        pvalue = ks_2samp(data_subpop[:,0], samples_subpop)[1]
        assert .05 < pvalue
    ax.set_xlabel('z')
    ax.set_ylabel('x')
    ax.grid() 
Example #11
Source File: test_mvkde.py    From cgpm with Apache License 2.0
def test_joint(kde_xz):
    # Simulate from the joint distribution of x,z (see
    # generate_real_nominal_data) and perform a KS test at each of the
    # subpopulations at the six levels of z.

    data = np.asarray(kde_xz.data.values())
    indicators = sorted(set(data[:,1].astype(int)))
    joint_samples = kde_xz.simulate(-1, [0,1], N=len(data))
    _, ax = plt.subplots()
    ax.set_title('Joint Simulation')
    for t in indicators:
        # Plot original data.
        data_subpop = data[data[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data for indicator t.
        samples_subpop = [j[0] for j in joint_samples if j[1] == t]
        ax.scatter(
            np.add([t]*len(samples_subpop), .25), samples_subpop,
            color=gu.colors[t])
        # KS test.
        _, p = ks_2samp(data_subpop[:,0], samples_subpop)
        assert .05 < p
    ax.set_xlabel('z')
    ax.set_ylabel('x')
    ax.grid() 
Example #12
Source File: test_mvkde.py    From cgpm with Apache License 2.0
def test_conditional_indicator(kde_xz):
    # Simulate from the conditional distribution of x|z (see
    # generate_real_nominal_data) and perform a KS test at each of the
    # subpopulations at the six levels of z.

    data = np.asarray(kde_xz.data.values())
    indicators = sorted(set(data[:,1].astype(int)))
    _, ax = plt.subplots()
    ax.set_title('Conditional Simulation Of X Given Indicator Z')
    for t in indicators:
        # Plot original data.
        data_subpop = data[data[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data.
        samples_subpop = [s[0] for s in
            kde_xz.simulate(-1, [0], {1:t}, None, N=len(data_subpop))]
        ax.scatter(
            np.repeat(t, len(data_subpop)) + .25,
            samples_subpop, color=gu.colors[t])
        # KS test.
        _, p = ks_2samp(data_subpop[:,0], samples_subpop)
        assert .1 < p
    ax.set_xlabel('z')
    ax.set_ylabel('x')
    ax.grid() 
Example #13
Source File: test_normal_categorical.py    From cgpm with Apache License 2.0
def test_joint(state):
    # Simulate from the joint distribution of (x,z).
    joint_samples = state.simulate(-1, [0,1], N=N_SAMPLES)
    _, ax = plt.subplots()
    ax.set_title('Joint Simulation')
    for t in INDICATORS:
        # Plot original data.
        data_subpop = DATA[DATA[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data for indicator t.
        samples_subpop = [j[0] for j in joint_samples if j[1] == t]
        ax.scatter(
            np.add([t]*len(samples_subpop), .25), samples_subpop,
            color=gu.colors[t])
        # KS test.
        pvalue = ks_2samp(data_subpop[:,0], samples_subpop)[1]
        assert .05 < pvalue
    ax.set_xlabel('Indicator')
    ax.set_ylabel('x')
    ax.grid() 
Example #14
Source File: test_loom_simulate_bivariate_gaussian.py    From bayeslite with Apache License 2.0
def test_simulate_y_from_partially_populated_fresh_row(seed):
    """Check that Loom conditions on partial observation in new rowid."""
    means = ((0,20), (20,0))
    sample_size = 50
    mix_ratio = [0.7, 0.3]
    table = 'data'

    with bayeslite.bayesdb_open(seed=seed) as bdb:
        sample_gaussians = axis_aligned_gaussians(means, sample_size, bdb._np_prng)
        samples = mix(sample_gaussians, mix_ratio, bdb._np_prng)
        register_loom(bdb)
        prepare_bdb(bdb, samples, table)

        rowid = insert_row(bdb, table, means[0][0], None)
        simulated_samples = simulate_from_rowid(bdb, table, 1, rowid,
            limit=sample_size)

    y_samples = [y for _x, y in sample_gaussians[0]]
    _statistic, p_value = stats.ks_2samp(y_samples, simulated_samples)
    assert 0.10 < p_value 
Example #15
Source File: test_normal_categorical.py    From cgpm with Apache License 2.0
def test_conditional_indicator(state):
    # Simulate from the conditional X|Z
    _, ax = plt.subplots()
    ax.set_title('Conditional Simulation Of Data X Given Indicator Z')
    for t in INDICATORS:
        # Plot original data.
        data_subpop = DATA[DATA[:,1] == t]
        ax.scatter(data_subpop[:,1], data_subpop[:,0], color=gu.colors[t])
        # Plot simulated data.
        samples_subpop = [s[0] for s in
            state.simulate(-1, [0], {1:t}, None, len(data_subpop))]
        ax.scatter(
            np.repeat(t, len(data_subpop)) + .25,
            samples_subpop, color=gu.colors[t])
        # KS test.
        pvalue = ks_2samp(data_subpop[:,0], samples_subpop)[1]
        assert .01 < pvalue
    ax.set_xlabel('Indicator')
    ax.set_ylabel('x')
    ax.grid() 
Example #16
Source File: test_tableone.py    From tableone with MIT License
def mytest(*args):
    """
    Hypothesis test for test_self_defined_statistical_tests
    """
    mytest.__name__ = "Test name"
    _, pval = stats.ks_2samp(*args)
    return pval 
Example #17
Source File: edgepy.py    From edgePy with MIT License
def ks_2_samples(self):
        """Run a 2-tailed Kolmogorov-Smirnov test on the DGEList object.

        Args:
            None.

        Returns:
            gene_details: a dictionary of dictionaries (keyed by gene), holding mean1 and mean2 for the two groups
            gene_likelihood: a dictionary (keyed by gene), holding the p-value of the separation of the two groups
            group_types: list of the groups in order.

        """
        gene_likelihood1: Dict[Hashable, float] = {}
        group_types = set(self.dge_list.groups_list)
        group_types = list(group_types)
        group_filters: Dict[Hashable, Any] = {}
        gene_details: Dict[Hashable, Dict[Hashable, Any]] = {}
        for group in group_types:
            group_filters[group] = [g == group for g in self.dge_list.groups_list]
        for gene_idx, gene in enumerate(self.dge_list.genes):
            gene_row = self.dge_list.counts[gene_idx]
            if len(group_types) == 2:
                group_data1 = gene_row.compress(group_filters[group_types[0]])
                mean1 = np.mean(group_data1)

                group_data2 = gene_row.compress(group_filters[group_types[1]])
                mean2 = np.mean(group_data2)

                gene_likelihood1[gene] = ks_2samp(group_data1, group_data2)[1]

                gene_details[gene] = {'mean1': mean1, 'mean2': mean2}
        return gene_details, gene_likelihood1, group_types 
Example #18
Source File: feature_selection.py    From default-credit-card-prediction with MIT License
def kolmogorov_smirnov_two_sample_test(sample_a,sample_b):
	"""
	Performs the two-sample Kolmogorov-Smirnov test, testing whether two samples are drawn from identical distributions

	Keyword arguments:
	sample_a -- The first sample
	sample_b -- The second sample
	"""

	return stats.ks_2samp(sample_a,sample_b) 
Example #19
Source File: metrics.py    From toad with MIT License
def KS(score, target):
    """calculate ks value

    Args:
        score (array-like): list of score or probability that the model predict
        target (array-like): list of real target

    Returns:
        float: the max KS value
    """
    mask = target == 1
    res = ks_2samp(score[mask], score[~mask])
    return res[0] 
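A usage sketch for the KS metric above, on hypothetical scores and binary targets; the positive-class scores are shifted upward, so the KS value should be clearly above zero.

import numpy as np
from scipy.stats import ks_2samp  # used inside KS above

rng = np.random.default_rng(2)
target = rng.integers(0, 2, size=1000)          # hypothetical binary labels
score = rng.normal(size=1000) + 0.8 * target    # positives tend to score higher

print(KS(score, target))  # maximum distance between the two score distributions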
Example #20
Source File: test_mvkde.py    From cgpm with Apache License 2.0
def test_univariate_two_sample(i):
    # This test ensures posterior sampling of unimodal/bimodal distributions on
    # R. When the plot is shown, a density curve overlays the samples, which is
    # useful for seeing that logpdf/simulate agree.
    N_SAMPLES = 100

    rng = gu.gen_rng(2)
    # Synthetic samples.
    samples_train = SAMPLES[i](N_SAMPLES, rng)
    samples_test = SAMPLES[i](N_SAMPLES, rng)
    # Univariate KDE.
    kde = MultivariateKde([3], None, distargs={O: {ST: [N], SA:[{}]}}, rng=rng)
    # Incorporate observations.
    for rowid, x in enumerate(samples_train):
        kde.incorporate(rowid, {3: x})
    # Run inference.
    kde.transition()
    # Generate posterior samples.
    samples_gen = [s[3] for s in kde.simulate(-1, [3], N=N_SAMPLES)]
    # Plot comparison of all train, test, and generated samples.
    fig, ax = plt.subplots()
    ax.scatter(samples_train, [0]*len(samples_train), color='b', label='Train')
    ax.scatter(samples_gen, [1]*len(samples_gen), color='r', label='KDE')
    ax.scatter(samples_test, [2]*len(samples_test), color='g', label='Test')
    # Overlay the density function.
    xs = np.linspace(ax.get_xlim()[0], ax.get_xlim()[1], 200)
    pdfs = [kde.logpdf(-1, {3: x}) for x in xs]
    # Rescale the pdfs to roughly the range 1 to 1.5 for plotting.
    pdfs_plot = np.exp(pdfs)+1
    pdfs_plot = (pdfs_plot/max(pdfs_plot)) * 1.5
    ax.plot(xs, pdfs_plot, color='k')
    # Clear up some labels.
    ax.set_title('Univariate KDE Posterior versus Generator')
    ax.set_xlabel('x')
    ax.set_yticklabels([])
    # Show the plot.
    ax.grid()
    plt.close()
    # KS test
    _, p = ks_2samp(samples_test, samples_gen)
    assert .05 < p 
Example #21
Source File: filters.py    From causallib with Apache License 2.0
def compute_pvals(self, X, y):
        # TODO: export to stats_utils?
        is_y_binary = (len(np.unique(y)) == 2)
        # is_binary_feature = np.sum(((X != np.nanmin(X, axis=0)[np.newaxis, :]) &
        #                             (X != np.nanmax(X, axis=0)[np.newaxis, :])), axis=0) == 0
        is_binary_feature = areColumnsBinary(X)
        p_vals = np.zeros(X.shape[1])
        if is_y_binary:
            # Process non-binary columns:
            for i in np.where(~is_binary_feature)[0]:
                x0 = X.loc[y == 0, i]
                x1 = X.loc[y == 1, i]
                if self.is_linear:
                    _, p_vals[i] = stats.ttest_ind(x0, x1)
                else:
                    _, p_vals[i] = stats.ks_2samp(x0, x1)

            # Process binary features:
            _, p_vals[is_binary_feature] = feature_selection.chi2(X.loc[:, is_binary_feature], y)

        else:
            # Process non-binary features:
            _, p_vals[~is_binary_feature] = feature_selection.f_regression(X.loc[:, ~is_binary_feature], y)

            # Process binary features:
            y_mat = np.row_stack(y)
            for i in np.where(is_binary_feature)[0]:
                _, p_vals[i] = feature_selection.f_regression(y_mat, X.loc[:, i])
        return p_vals 
Example #22
Source File: test_sample.py    From pyPESTO with BSD 3-Clause "New" or "Revised" License
def test_prior():
    """Check that priors are defined for sampling."""
    # define negative log posterior
    posterior_fun = pypesto.Objective(fun=negative_log_posterior)

    # define negative log prior
    prior_fun = pypesto.Objective(fun=negative_log_prior)

    # define pypesto prior object
    prior_object = pypesto.NegLogPriors(objectives=[prior_fun])

    # define pypesto problem using prior object
    test_problem = pypesto.Problem(objective=posterior_fun,
                                   x_priors_defs=prior_object,
                                   lb=-10, ub=10,
                                   x_names=['x'])

    sampler = sample.AdaptiveMetropolisSampler()

    result = sample.sample(test_problem, n_samples=1e4, sampler=sampler,
                           x0=np.array([0.]))

    # get log prior values of first chain
    logprior_trace = -result.sample_result.trace_neglogprior[0, :]

    # check that not all entries are zero
    assert (logprior_trace != 0.).any()

    # get samples of first chain
    samples = result.sample_result.trace_x[0, :, 0]

    # generate ground-truth samples
    rvs = norm.rvs(size=5000, loc=-1., scale=np.sqrt(0.7))

    # check sample distribution agreement with the ground-truth
    statistic, pval = ks_2samp(rvs, samples)
    print(statistic, pval)

    assert statistic < 0.1 
Example #23
Source File: test_dc_stat_think.py    From dc_stat_think with MIT License
def test_ks_stat(x):
    theor_data = np.random.normal(0, 1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)

    theor_data = np.random.exponential(1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)

    theor_data = np.random.logistic(0, 1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct) 
Example #24
Source File: test_dc_stat_think.py    From dc_stat_think with MIT License
def test_pandas_conversion(seed):
    df = pd.DataFrame({'a': [3, 2, 1, 4],
                       'b': [8, 6, 7, 5],
                       'c': [9.1, 10.1, 11.1, np.nan]})

    x, y = dcst.ecdf(df.loc[:, 'a'])
    assert (x == np.array([1, 2, 3, 4])).all()
    assert (y == np.array([0.25, 0.5, 0.75, 1.0])).all()

    x, y = dcst.ecdf(df.loc[:, 'c'])
    assert np.allclose(x, np.array([9.1, 10.1, 11.1]))
    assert np.allclose(y, np.array([1/3, 2/3, 1.0]))

    df = pd.DataFrame({
        'a': np.concatenate((np.random.normal(0, 1, size=10), [np.nan]*990)),
        'b': np.random.normal(0, 1, size=1000)})
    correct, _ = st.ks_2samp(df['a'].dropna(), df['b'])
    assert np.isclose(dcst.ks_stat(df['a'], df['b']), correct)

    df = pd.DataFrame({
        'a': np.concatenate((np.random.normal(0, 1, size=80), [np.nan]*20)),
        'b': np.random.normal(0, 1, size=100)})
    dcst_private._seed_numba(seed)
    correct = dcst.draw_bs_reps(df['a'].values, np.mean, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_bs_reps(df['a'], np.mean, size=100), correct,
                       atol=atol)

    dcst_private._seed_numba(seed)
    correct = dcst.draw_bs_reps(df['b'].values, np.mean, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_bs_reps(df['b'], np.mean, size=100), correct,
                       atol=atol)

    dcst_private._seed_numba(seed)
    correct = dcst.draw_perm_reps(df['a'].values, df['b'].values,
                                  dcst.diff_of_means, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_perm_reps(df['a'], df['b'],
                       dcst.diff_of_means, size=100), correct, atol=atol) 
Example #25
Source File: _dp_verification.py    From whitenoise-system with MIT License
def ks_test(self, fD1, fD2):
        """
        K-S Two sample test between the repeated query results on neighboring datasets
        """
        return stats.ks_2samp(fD1, fD2) 
Example #26
Source File: test_dissimilarity.py    From flyingpigeon with Apache License 2.0
def test_1D_ks_2samp(self):
        # Compare with scipy.stats.ks_2samp
        x = np.random.randn(50) + 1
        y = np.random.randn(50)
        s, p = stats.ks_2samp(x, y)
        dm = dd.kolmogorov_smirnov(x, y)
        aaeq(dm, s, 3) 
Example #27
Source File: sample_from_the_prior_test.py    From bilby with MIT License
def ks_2samp_wrapper(data1, data2):
    if version.parse(scipy.__version__) >= version.parse("1.3.0"):
        return ks_2samp(data1, data2, alternative="two-sided", mode="asymp")
    else:
        return ks_2samp(data1, data2) 
Example #28
Source File: gw_utils_test.py    From bilby with MIT License
def test_conversion_gives_correct_prior(self) -> None:
        zeniths = self.samples["zenith"]
        azimuths = self.samples["azimuth"]
        times = self.samples["time"]
        args = zip(*[
            (zenith, azimuth, time, self.ifos)
            for zenith, azimuth, time in zip(zeniths, azimuths, times)
        ])
        ras, decs = zip(*map(bilby.gw.utils.zenith_azimuth_to_ra_dec, *args))
        self.assertGreaterEqual(ks_2samp(self.samples["ra"], ras).pvalue, 0.01)
        self.assertGreaterEqual(ks_2samp(self.samples["dec"], decs).pvalue, 0.01) 
Example #29
Source File: burn_in.py    From pycbc with GNU General Public License v3.0
def ks_test(samples1, samples2, threshold=0.9):
    """Applies a KS test to determine if two sets of samples are the same.

    The ks test is applied parameter-by-parameter. If the two-tailed p-value
    returned by the test is greater than ``threshold``, the samples are
    considered to be the same.

    Parameters
    ----------
    samples1 : dict
        Dictionary mapping parameter names to the first set of samples.
    samples2 : dict
        Dictionary mapping parameter names to the second set of samples.
    threshold : float
        The threshold to use for the p-value. Default is 0.9.

    Returns
    -------
    dict :
        Dictionary mapping parameter names to booleans indicating whether the
        given parameter passes the KS test.
    """
    is_the_same = {}
    assert set(samples1.keys()) == set(samples2.keys()), (
        "samples1 and 2 must have the same parameters")
    # iterate over the parameters
    for param in samples1:
        s1 = samples1[param]
        s2 = samples2[param]
        _, p_value = ks_2samp(s1, s2)
        is_the_same[param] = p_value > threshold
    return is_the_same 
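A quick sketch of calling ks_test above on two hypothetical dictionaries of samples; the parameter names and values are illustrative only.

import numpy as np
from scipy.stats import ks_2samp  # required by ks_test above

rng = np.random.default_rng(3)
samples1 = {'mass': rng.normal(30, 5, size=500),
            'spin': rng.uniform(0, 1, size=500)}
samples2 = {'mass': rng.normal(30, 5, size=500),
            'spin': rng.uniform(0, 1, size=500)}

# Maps each parameter to True only if the KS p-value exceeds the threshold.
print(ks_test(samples1, samples2, threshold=0.9))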
Example #30
Source File: kswin.py    From scikit-multiflow with BSD 3-Clause "New" or "Revised" License
def add_element(self, input_value):
        """ Add element to sliding window

        Adds an element on top of the sliding window and removes
        the oldest one from the window. Afterwards, the KS-test
        is performed.

        Parameters
        ----------
        input_value: ndarray
            New data sample the sliding window should add.
        """
        self.n += 1
        currentLength = self.window.shape[0]
        if currentLength >= self.window_size:
            self.window = np.delete(self.window,0)
            rnd_window = np.random.choice(self.window[:-self.stat_size], self.stat_size)

            (st, self.p_value) = stats.ks_2samp(rnd_window, self.window[-self.stat_size:],mode="exact")

            if self.p_value <= self.alpha and st > 0.1:
                self.change_detected = True
                self.window = self.window[-self.stat_size:]
            else:
                self.change_detected = False
        else: # Not enough samples in sliding window for a valid test
            self.change_detected = False

        self.window = np.concatenate([self.window,[input_value]])
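A minimal usage sketch for the detector above, assuming scikit-multiflow exposes KSWIN under skmultiflow.drift_detection with the constructor parameters shown; the stream values are synthetic and purely illustrative.

import numpy as np
from skmultiflow.drift_detection import KSWIN

rng = np.random.default_rng(4)
# Synthetic stream whose distribution shifts halfway through.
stream = np.concatenate([rng.normal(0, 1, size=1000),
                         rng.normal(3, 1, size=1000)])

kswin = KSWIN(alpha=0.005, window_size=100, stat_size=30)
for i, value in enumerate(stream):
    kswin.add_element(value)
    if kswin.detected_change():
        print('Change detected at index', i)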