Python Examples of numpy.ma.masked

Source File: mstats_basic.py From Computable with MIT License

6 votes

def kruskalwallis(*args):
    output = argstoarray(*args)
    ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
    sumrk = ranks.sum(-1)
    ngrp = ranks.count(-1)
    ntot = ranks.count()
#    ssbg = (sumrk**2/ranks.count(-1)).sum() - ranks.sum()**2/ntotal
#    H = ssbg / (ntotal*(ntotal+1)/12.)
    H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
    # Tie correction
    ties = count_tied_groups(ranks)
    T = 1. - np.sum(v*(k**3-k) for (k,v) in iteritems(ties))/float(ntot**3-ntot)
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')
    H /= T
    #
    df = len(output) - 1
    prob = stats.chisqprob(H,df)
    return (H, prob)

Source File: corex.py From discrete_sieve with Apache License 2.0

6 votes

def transform(self, X, details=False):
        """
        Label hidden factors for (possibly previously unseen) samples of data.
        Parameters: samples of data, X, shape = [n_samples, n_visible]
        Returns: , shape = [n_samples, n_hidden]
        """
        Xm = ma.masked_equal(X, self.missing_values)
        log_marg_x = self.calculate_marginals_on_samples(self.theta, Xm)
        p_y_given_x, log_z = self.calculate_latent(log_marg_x)
        labels = self.label(p_y_given_x)
        if details == 'surprise':
            # Totally experimental
            log_marg_x = self.calculate_marginals_on_samples(self.theta, Xm, return_ratio=False)
            n_samples = Xm.shape[0]
            surprise = []
            for l in range(n_samples):
                q = - sum([max([log_marg_x[j,l,i,labels[l, j]]
                                for j in range(self.n_hidden)])
                           for i in range(self.n_visible)])
                surprise.append(q)
            return p_y_given_x, log_z, np.array(surprise)
        elif details:
            return p_y_given_x, log_z
        else:
            return labels

Source File: corex.py From bio_corex with Apache License 2.0

6 votes

def transform(self, X, details=False):
        """
        Label hidden factors for (possibly previously unseen) samples of data.
        Parameters: samples of data, X, shape = [n_samples, n_visible]
        Returns: , shape = [n_samples, n_hidden]
        """
        Xm = ma.masked_equal(X, self.missing_values)
        p_y_given_x, log_z = self.calculate_latent(self.theta, Xm)
        labels = self.label(p_y_given_x)
        if details == 'surprise':
            # Totally experimental
            log_marg_x = self.calculate_marginals_on_samples(self.theta, Xm, return_ratio=False)
            n_samples = Xm.shape[0]
            surprise = []
            for l in range(n_samples):
                q = - sum([max([log_marg_x[j,l,i,labels[l, j]]
                                for j in range(self.n_hidden)])
                           for i in range(self.n_visible)])
                surprise.append(q)
            return p_y_given_x, log_z, np.array(surprise)
        elif details:
            return p_y_given_x, log_z
        else:
            return labels

Source File: test_filter.py From pfilter with MIT License

5 votes

def test_partial_missing():
    # check that
    pf = ParticleFilter(
        prior_fn=lambda n: np.random.normal(0, 1, (n, 4)), n_particles=100
    )
    for i in range(10):
        masked_input = ma.masked_equal(np.array([1, 999, 0, 999]), 999)
        pf.update(masked_input)
        pf.update(np.array([1, 1, 1, 1]))
        assert np.allclose(np.sum(pf.weights), 1.0)
        assert len(pf.weights) == len(pf.particles) == 100

Source File: mstats_basic.py From lambda-packs with MIT License

5 votes

def kruskal(*args):
    """
    Compute the Kruskal-Wallis H-test for independent samples

    Parameters
    ----------
    sample1, sample2, ... : array_like
       Two or more arrays with the sample measurements can be given as
       arguments.

    Returns
    -------
    statistic : float
       The Kruskal-Wallis H statistic, corrected for ties
    pvalue : float
       The p-value for the test using the assumption that H has a chi
       square distribution

    Notes
    -----
    For more details on `kruskal`, see `stats.kruskal`.

    """
    output = argstoarray(*args)
    ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
    sumrk = ranks.sum(-1)
    ngrp = ranks.count(-1)
    ntot = ranks.count()
    H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
    # Tie correction
    ties = count_tied_groups(ranks)
    T = 1. - np.sum(v*(k**3-k) for (k,v) in iteritems(ties))/float(ntot**3-ntot)
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')

    H /= T
    df = len(output) - 1
    prob = distributions.chi2.sf(H, df)
    return KruskalResult(H, prob)

Source File: mstats_basic.py From GraphicDesignPatternByPython with MIT License

5 votes

def kruskal(*args):
    """
    Compute the Kruskal-Wallis H-test for independent samples

    Parameters
    ----------
    sample1, sample2, ... : array_like
       Two or more arrays with the sample measurements can be given as
       arguments.

    Returns
    -------
    statistic : float
       The Kruskal-Wallis H statistic, corrected for ties
    pvalue : float
       The p-value for the test using the assumption that H has a chi
       square distribution

    Notes
    -----
    For more details on `kruskal`, see `stats.kruskal`.

    """
    output = argstoarray(*args)
    ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
    sumrk = ranks.sum(-1)
    ngrp = ranks.count(-1)
    ntot = ranks.count()
    H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
    # Tie correction
    ties = count_tied_groups(ranks)
    T = 1. - sum(v*(k**3-k) for (k,v) in iteritems(ties))/float(ntot**3-ntot)
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')

    H /= T
    df = len(output) - 1
    prob = distributions.chi2.sf(H, df)
    return KruskalResult(H, prob)

Source File: masks.py From argos with GNU General Public License v3.0

5 votes

def maskedEqual(array, missingValue):
    """ Mask an array where equal to a given (missing)value.

        Unfortunately ma.masked_equal does not work with structured arrays. See:
        https://mail.scipy.org/pipermail/numpy-discussion/2011-July/057669.html

        If the data is a structured array the mask is applied for every field (i.e. forming a
        logical-and). Otherwise ma.masked_equal is called.
    """
    if array_is_structured(array):
        # Enforce the array to be masked
        if not isinstance(array, ma.MaskedArray):
            array = ma.MaskedArray(array)

        # Set the mask separately per field
        for nr, field in enumerate(array.dtype.names):
            if hasattr(missingValue, '__len__'):
                fieldMissingValue = missingValue[nr]
            else:
                fieldMissingValue = missingValue

            array[field] = ma.masked_equal(array[field], fieldMissingValue)

        check_class(array, ma.MaskedArray) # post-condition check
        return array
    else:
        # masked_equal works with missing is None
        result = ma.masked_equal(array, missingValue, copy=False)
        check_class(result, ma.MaskedArray) # post-condition check
        return result

Source File: mstats_basic.py From Splunking-Crime with GNU Affero General Public License v3.0

5 votes

def kruskal(*args):
    """
    Compute the Kruskal-Wallis H-test for independent samples

    Parameters
    ----------
    sample1, sample2, ... : array_like
       Two or more arrays with the sample measurements can be given as
       arguments.

    Returns
    -------
    statistic : float
       The Kruskal-Wallis H statistic, corrected for ties
    pvalue : float
       The p-value for the test using the assumption that H has a chi
       square distribution

    Notes
    -----
    For more details on `kruskal`, see `stats.kruskal`.

    """
    output = argstoarray(*args)
    ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
    sumrk = ranks.sum(-1)
    ngrp = ranks.count(-1)
    ntot = ranks.count()
    H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
    # Tie correction
    ties = count_tied_groups(ranks)
    T = 1. - np.sum(v*(k**3-k) for (k,v) in iteritems(ties))/float(ntot**3-ntot)
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')

    H /= T
    df = len(output) - 1
    prob = distributions.chi2.sf(H, df)
    return KruskalResult(H, prob)

Source File: vis_corex.py From bio_corex with Apache License 2.0

5 votes

def calculate_log_latent(corex, X):
    """"Calculate the log probabilities for hidden factors for each sample, with high precision."""
    Xm = ma.masked_equal(X, corex.missing_values)
    n_samples, n_visible = Xm.shape
    n_hidden, dim_hidden, ram = corex.n_hidden, corex.dim_hidden, corex.ram
    log_p_y_given_x_unnorm = np.empty((n_hidden, n_samples, dim_hidden))
    memory_size = float(n_samples * n_visible * n_hidden * dim_hidden * 64) / 1000**3  # GB
    batch_size = np.clip(int(ram * n_samples / memory_size), 1, n_samples)
    for l in range(0, n_samples, batch_size):
        log_marg_x = corex.calculate_marginals_on_samples(corex.theta, Xm[l:l+batch_size])  # LLRs for each sample, for each var.
        log_p_y_given_x_unnorm[:, l:l+batch_size, :] = corex.log_p_y + np.einsum('ikl,ijkl->ijl', corex.alpha, log_marg_x, optimize=False)
    return normalize_latent(log_p_y_given_x_unnorm, n_hidden)

Source File: _gridprop_import_grdecl.py From xtgeo with GNU Lesser General Public License v3.0

5 votes

def import_grdecl_prop(self, pfile, name="unknown", grid=None):
    """Read a GRDECL ASCII property record"""

    if grid is None:
        raise ValueError("A grid instance is required as argument")

    self._ncol = grid.ncol
    self._nrow = grid.nrow
    self._nlay = grid.nlay
    self._name = name
    self._filesrc = pfile
    actnumv = grid.get_actnum().values

    # This requires that the Python part clean up comments
    # etc, and make a tmp file.
    fds, tmpfile = mkstemp(prefix="tmpxtgeo")
    os.close(fds)

    with open(pfile.name, "r") as oldfile, open(tmpfile, "w") as newfile:
        for line in oldfile:
            if not (re.search(r"^--", line) or re.search(r"^\s+$", line)):
                newfile.write(line)

    # now read the property
    nlen = self._ncol * self._nrow * self._nlay
    ier, values = _cxtgeo.grd3d_import_grdecl_prop(
        tmpfile, self._ncol, self._nrow, self._nlay, name, nlen, 0,
    )

    os.remove(tmpfile)

    if ier != 0:
        raise xtgeo.KeywordNotFoundError(
            "Cannot import {}, not present in file {}?".format(name, pfile)
        )

    self.values = values.reshape(self.dimensions)
    self.values = ma.masked_equal(self.values, actnumv == 0)

Source File: corex.py From discrete_sieve with Apache License 2.0

4 votes

def fit_transform(self, X):
        """Fit CorEx on the data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_visible]
            The data.

        Returns
        -------
        Y: array-like, shape = [n_samples, n_hidden]
           Learned values for each latent factor for each sample.
           Y's are sorted so that Y_1 explains most correlation, etc.
        """

        Xm = ma.masked_equal(X, self.missing_values)

        best_tc = -np.inf
        for n_rep in range(self.n_repeat):

            self.initialize_parameters(X)

            for nloop in range(self.max_iter):

                self.log_p_y = self.calculate_p_y(self.p_y_given_x)
                self.theta = self.calculate_theta(Xm, self.p_y_given_x)

                log_marg_x = self.calculate_marginals_on_samples(self.theta, Xm)  # LLRs for each sample, for each var.

                self.p_y_given_x, self.log_z = self.calculate_latent(log_marg_x)

                self.update_tc(self.log_z)  # Calculate TC and record history to check convergence

                self.print_verbose()
                if self.convergence():
                    break

            if self.tc > best_tc:
                best_tc = self.tc
                best_dict = self.__dict__.copy()
        self.__dict__ = best_dict
        if self.verbose:
            print 'Best tc:', self.tc

        self.sort_and_output(Xm)

        return self.labels

Source File: corex.py From bio_corex with Apache License 2.0

4 votes

def fit_transform(self, X):
        """Fit CorEx on the data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_visible]
            The data.

        Returns
        -------
        Y: array-like, shape = [n_samples, n_hidden]
           Learned values for each latent factor for each sample.
           Y's are sorted so that Y_1 explains most correlation, etc.
        """

        if self.n_cpu == 1:
            self.pool = None
        else:
            self.pool = Pool(self.n_cpu)
        Xm = ma.masked_equal(X, self.missing_values)
        best_tc = -np.inf
        for n_rep in range(self.n_repeat):

            self.initialize_parameters(X)

            for nloop in range(self.max_iter):

                self.log_p_y = self.calculate_p_y(self.p_y_given_x)
                self.theta = self.calculate_theta(Xm, self.p_y_given_x)

                if self.n_hidden > 1:  # Structure learning step
                    self.update_alpha(self.p_y_given_x, self.theta, Xm, self.tcs)

                self.p_y_given_x, self.log_z = self.calculate_latent(self.theta, Xm)

                self.update_tc(self.log_z)  # Calculate TC and record history to check convergence

                self.print_verbose()
                if self.convergence():
                    break

            if self.verbose:
                print('Overall tc:', self.tc)
            if self.tc > best_tc:
                best_tc = self.tc
                best_dict = self.__dict__.copy()  # TODO: what happens if n_cpu > 1 and n_repeat > 1? Does pool get copied? Probably not...just a pointer to the same object... Seems fine.
        self.__dict__ = best_dict
        if self.verbose:
            print('Best tc:', self.tc)

        self.sort_and_output(Xm)
        if self.pool is not None:
            self.pool.close()
            self.pool = None
        return self.labels

Python numpy.ma.masked_equal() Examples