Python scipy.stats.percentileofscore() Examples

The following are 14 code examples of scipy.stats.percentileofscore(), taken from open-source projects. The source file and project for each example are listed above it.
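percentileofscore(a, score, kind='rank') returns the percentile rank of a score relative to a list of scores, as a value between 0 and 100. A minimal usage sketch (the results below follow the behaviour documented for SciPy's kind options):

from scipy.stats import percentileofscore

percentileofscore([1, 2, 3, 4], 3)                    # 75.0 (default kind='rank')
percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')    # 80.0 -- fraction of scores <= 3
percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')  # 40.0 -- fraction of scores < 3
percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')    # 60.0 -- average of 'weak' and 'strict'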
Example #1
Source File: test_aggregator.py    From pywr with GNU General Public License v3.0
def agg_func(request):
    agg_func_name = request.param

    if agg_func_name == "custom":
        # When using custom you assign the function rather than a string.
        agg_func_name = npy_func = custom_test_func
    elif agg_func_name == "percentile":
        agg_func_name = {
            "func": "percentile",
            "args": [95],
            "kwargs": {}
        }
        npy_func = partial(np.percentile, q=95)
    elif agg_func_name == "percentileofscore":
        agg_func_name = {
            "func": "percentileofscore",
            "kwargs": {
                "score": 0.5,
                "kind": "rank"
            }
        }
        npy_func = partial(percentileofscore_with_axis, score=0.5, kind="rank")
    else:
        npy_func = npy_funcs[agg_func_name]
    return agg_func_name, npy_func 
Example #2
Source File: predict.py    From PIDGINv3 with GNU General Public License v3.0
def doPercentileCalculation(model_name):
	global rdkit_mols
	#expensive to unzip training file - so only done if smiles requested
	if options.ad_smiles:
		smiles = get_training_smiles(model_name)
	ad_data = getAdData(model_name)
	def calcPercentile(rdkit_mol):
		sims = DataStructs.BulkTanimotoSimilarity(rdkit_mol,ad_data[:,0])
		bias = ad_data[:,2].astype(float)
		std_dev = ad_data[:,3].astype(float)
		scores = ad_data[:,5].astype(float)
		weights = sims / (bias * std_dev)
		critical_weight = weights.max()
		percentile = percentileofscore(scores,critical_weight)
		if options.ad_smiles:
			critical_smiles = smiles[np.argmax(weights)]
			result = percentile, critical_smiles
		else:
			result = percentile, None
		return result
	ret = [calcPercentile(x) for x in rdkit_mols]
	return model_name, ret

#prediction runner for percentile calculation 
Example #3
Source File: ABuPtPosition.py    From abu with GNU General Public License v3.0
def fit_position(self, factor_object):
        """
        Position management for mean-reversion style strategies:
        the position size is decided by where the current buy price ranks within the prices of a past window of the financial series.
        fit_position returns how many units to buy (shares, lots, tons, contracts).
        :param factor_object: instance of an ABuFactorBuyBases subclass
        :return: how many units to buy (shares, lots, tons, contracts)
        """

        # self.kl_pd_buy is the data for the buy day; fetch the preceding past_day_cnt days of data
        last_kl = factor_object.past_today_kl(self.kl_pd_buy, self.past_day_cnt)
        if last_kl is None or last_kl.empty:
            precent_pos = self.pos_base
        else:
            # Use percentileofscore to compute where the buy price ranks within the past past_day_cnt days of prices
            precent_pos = stats.percentileofscore(last_kl.close, self.bp)
            precent_pos = (1 + (self.mid_precent - precent_pos) / 100) * self.pos_base
        # Cap the position: it remains subject to the upper-level maximum position limit, e.g. a computed full position is still reduced to 75%; change the maximum position value if this needs adjusting
        precent_pos = self.pos_max if precent_pos > self.pos_max else precent_pos
        # The result is how many units to buy (shares, lots, tons, contracts)
        return self.read_cash * precent_pos / self.bp * self.deposit_rate
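The core of the sizing rule above can be seen in a small standalone sketch (hypothetical numbers, not part of the abu code): a buy price that ranks low in the recent window yields a percentile below mid_precent and therefore a position above pos_base.

import numpy as np
from scipy import stats

recent_close = np.array([10.0, 10.5, 9.8, 10.2, 9.9, 10.1])  # hypothetical recent closes
buy_price = 9.85
pos_base, mid_precent = 0.1, 50.0

rank = stats.percentileofscore(recent_close, buy_price)   # ~16.7: the buy price is cheap relative to the window
position = (1 + (mid_precent - rank) / 100) * pos_base    # ~0.133, i.e. larger than pos_base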
Example #4
Source File: drought.py    From RHEAS with MIT License
def calcSeverity(model, varname="soil_moist"):
    """Calculate drought severity from *climatology* table stored in database."""
    db = dbio.connect(model.dbname)
    cur = db.cursor()
    if varname == "soil_moist":
        sql = "select fdate,(ST_DumpValues(st_union(rast,'sum'))).valarray from {0}.soil_moist group by fdate order by fdate".format(model.name)
    else:
        sql = "select fdate,(ST_DumpValues(rast)).valarray from {0}.runoff order by fdate".format(model.name)
    cur.execute(sql)
    results = cur.fetchall()
    data = np.array([np.array(r[1]).ravel() for r in results])
    i = np.where(np.not_equal(data[0, :], None))[0]
    p = pandas.DataFrame(data[:, i], index=np.array([r[0] for r in results], dtype='datetime64'), columns=range(len(i)))
    p = p.rolling('10D').mean()  # calculate percentiles with dekad rolling mean
    st = "{0}-{1}-{2}".format(model.startyear, model.startmonth, model.startday)
    et = "{0}-{1}-{2}".format(model.endyear, model.endmonth, model.endday)
    s = np.array([[stats.percentileofscore(p[pi].values, v) for v in p[pi][st:et]] for pi in p.columns]).T
    s = 100.0 - s
    cur.close()
    db.close()
    return s 
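The severity calculation reduces to ranking each current value against its own climatology and inverting the rank, so unusually dry conditions map to high severity. A standalone sketch with hypothetical numbers:

import numpy as np
from scipy import stats

history = np.array([0.32, 0.28, 0.35, 0.30, 0.25, 0.27])  # hypothetical soil moisture climatology for one cell
current = 0.26
severity = 100.0 - stats.percentileofscore(history, current)  # ~83.3: high severity for an unusually low value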
Example #5
Source File: test_aggregator.py    From pywr with GNU General Public License v3.0
def percentileofscore_with_axis(values, *args, axis=0, **kwargs):
    if values.ndim == 1:
        # For 1D data we just calculate the percentile
        out = percentileofscore(values, *args, **kwargs)
    elif axis == 0:
        # 2D data by axis 0
        out = [percentileofscore(values[:, i], *args, **kwargs) for i in range(values.shape[1])]
    elif axis == 1:
        # 2D data by axis 1
        out = [percentileofscore(values[i, :], *args, **kwargs) for i in range(values.shape[0])]
    else:
        raise ValueError('Axis "{}" not supported'.format(axis))
    return out 
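A quick usage sketch for the helper above, with hypothetical data and assuming the helper (and its percentileofscore import) is in scope; the positional score argument is forwarded to percentileofscore for each 1D slice.

import numpy as np

data = np.array([[0.1, 0.9],
                 [0.4, 0.6],
                 [0.7, 0.3]])

percentileofscore_with_axis(data[:, 0], 0.5)    # scalar percentile for a 1D array
percentileofscore_with_axis(data, 0.5, axis=0)  # one percentile per column
percentileofscore_with_axis(data, 0.5, axis=1)  # one percentile per row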
Example #6
Source File: eia860_test.py    From pudl with MIT License
def test_own_eia860(pudl_out_eia, live_pudl_db):
    """Sanity checks for EIA 860 generator ownership data."""
    if not live_pudl_db:
        raise AssertionError("Data validation only works with a live PUDL DB.")
    logger.info('Reading EIA 860 generator ownership data...')
    own_out = pudl_out_eia.own_eia860()

    if (own_out.fraction_owned > 1.0).any():
        raise AssertionError(
            "Generators with ownership fractions > 1.0 found. Bad data?"
        )

    if (own_out.fraction_owned < 0.0).any():
        raise AssertionError(
            "Generators with ownership fractions < 0.0 found. Bad data?"
        )

    # Verify that the reported ownership fractions add up to something very
    # close to 1.0 (i.e. that the full ownership of each generator is
    # specified by the EIA860 data)
    own_gb = own_out.groupby(['report_date', 'plant_id_eia', 'generator_id'])
    own_sum = own_gb['fraction_owned'].agg(helpers.sum_na).reset_index()
    logger.info(
        f"{len(own_sum[own_sum.fraction_owned.isnull()])} generator-years have no ownership data.")

    own_sum = own_sum.dropna()
    pct_missing = stats.percentileofscore(own_sum.fraction_owned, 0.98)
    logger.info(
        f"{len(own_sum[own_sum.fraction_owned < 0.98])} ({pct_missing}%) "
        f"generator-years have incomplete ownership data.")
    if not max(own_sum['fraction_owned'] < 1.02):
        raise ValueError("Plants with more than 100% ownership found...")
    # There might be a few generators with incomplete ownership but virtually
    # everything should be pretty fully described. If not, let us know. The
    # 0.5 threshold means 0.5% -- i.e. less than 1 in 200 is partial.
    if pct_missing >= 0.5:
        raise ValueError(
            f"{pct_missing}% of generators lack complete ownership data."
        ) 
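The pct_missing line above uses percentileofscore in reverse: instead of ranking an observed value, it asks what share of the data falls below a fixed threshold (0.98). A minimal sketch with hypothetical values:

import numpy as np
from scipy import stats

fraction_owned = np.array([1.0, 1.0, 0.75, 1.0, 0.5, 1.0])      # hypothetical ownership fractions
pct_incomplete = stats.percentileofscore(fraction_owned, 0.98)  # ~33.3: share of values below the 0.98 threshold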
Example #7
Source File: anomaly_detectors.py    From datastream.io with Apache License 2.0
def score_anomaly(self, x):
        x = pd.Series(x)
        scores = pd.Series([0.01*percentileofscore(self.sample_, z) for z in x])
        return scores 
Example #8
Source File: detector.py    From datastream.io with Apache License 2.0
def score(self, x):
        from scipy.stats import percentileofscore
        return [0.01*percentileofscore(x, z) for z in x] 
Example #9
Source File: perf.py    From tia with BSD 3-Clause "New" or "Revised" License
def rolling_percentileofscore(series, window, min_periods=None):
    """Computue the score percentile for the specified window."""
    import scipy.stats as stats

    def _percentile(arr):
        score = arr[-1]
        vals = arr[:-1]
        return stats.percentileofscore(vals, score)

    notnull = series.dropna()
    min_periods = min_periods or window
    if notnull.empty:
        return pd.Series(np.nan, index=series.index)
    else:
        return pd.rolling_apply(notnull, window, _percentile, min_periods=min_periods).reindex(series.index) 
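Note that this snippet relies on pd.rolling_apply, which was removed in pandas 0.23. A rough modern-pandas equivalent (an assumption, not part of the tia source) uses Series.rolling().apply() with the same _percentile logic:

import numpy as np
import pandas as pd
import scipy.stats as stats

def rolling_percentileofscore_modern(series, window, min_periods=None):
    def _percentile(arr):
        # score the newest value against the values that precede it in the window
        return stats.percentileofscore(arr[:-1], arr[-1])

    notnull = series.dropna()
    if notnull.empty:
        return pd.Series(np.nan, index=series.index)
    return (notnull.rolling(window, min_periods=min_periods or window)
                   .apply(_percentile, raw=True)
                   .reindex(series.index))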
Example #10
Source File: perf.py    From tia with BSD 3-Clause "New" or "Revised" License
def expanding_percentileofscore(series, min_periods=None):
    import scipy.stats as stats

    def _percentile(arr):
        score = arr[-1]
        vals = arr[:-1]
        return stats.percentileofscore(vals, score)

    notnull = series.dropna()
    if notnull.empty:
        return pd.Series(np.nan, index=series.index)
    else:
        return pd.expanding_apply(notnull, _percentile, min_periods=min_periods).reindex(series.index) 
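Likewise, pd.expanding_apply is gone from modern pandas; an assumed equivalent for the line above, reusing notnull and _percentile from this example, is:

notnull.expanding(min_periods=min_periods or 1).apply(_percentile, raw=True).reindex(series.index)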
Example #11
Source File: stats.py    From NiMARE with MIT License
def null_to_p(test_value, null_array, tail='two'):
    """Return p-value for test value against null array.

    Parameters
    ----------
    test_value : :obj:`float`
        Value for which to determine p-value.
    null_array : 1D :class:`numpy.ndarray`
        Null distribution against which test_value is compared.
    tail : {'two', 'upper', 'lower'}, optional
        Whether to compare value against null distribution in a two-sided
        ('two') or one-sided ('upper' or 'lower') manner.
        If 'upper', then higher values for the test_value are more significant.
        If 'lower', then lower values for the test_value are more significant.
        Default is 'two'.

    Returns
    -------
    p_value : :obj:`float`
        P-value associated with the test value when compared against the null
        distribution.
    """
    if tail == 'two':
        p_value = (50 - np.abs(stats.percentileofscore(
            null_array, test_value) - 50.)) * 2. / 100.
    elif tail == 'upper':
        p_value = 1 - (stats.percentileofscore(null_array, test_value) / 100.)
    elif tail == 'lower':
        p_value = stats.percentileofscore(null_array, test_value) / 100.
    else:
        raise ValueError('Argument "tail" must be one of ["two", "upper", '
                         '"lower"]')
    return p_value 
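A short usage sketch for null_to_p (hypothetical values; assumes the function above with scipy.stats imported as stats): with a simulated standard-normal null, an observed value of 1.96 lands near the familiar tail probabilities.

import numpy as np

null = np.random.normal(size=100000)  # simulated null distribution
obs = 1.96                            # hypothetical observed test value

null_to_p(obs, null, tail='two')    # close to 0.05
null_to_p(obs, null, tail='upper')  # close to 0.025
null_to_p(obs, null, tail='lower')  # close to 0.975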
Example #12
Source File: replay.py    From nelpy with MIT License
def score_hmm_events_no_xval(bst, training=None, validation=None, num_states=30, n_shuffles=5000, shuffle='row-wise', verbose=False):
    """same as score_hmm_events, but train on training set, and only score validation set..."""
    if shuffle == 'row-wise':
        rowwise = True
    elif shuffle == 'col-wise':
        rowwise = False
    else:
        shuffle = 'timeswap'

    scores_hmm = np.zeros(len(validation))
    scores_hmm_shuffled = np.zeros((len(validation), n_shuffles))

    PBEs_train = bst[training]
    PBEs_test = bst[validation]

    # train HMM on all training PBEs
    hmm = hmmutils.PoissonHMM(n_components=num_states, random_state=0, verbose=False)
    hmm.fit(PBEs_train)

    # reorder states according to transmat ordering
    transmat_order = hmm.get_state_order('transmat')
    hmm.reorder_states(transmat_order)

    # compute scores_hmm (log likelihoods) of validation set:
    scores_hmm[:] = hmm.score(PBEs_test)

    if shuffle == 'timeswap':
        _, scores_tswap_hmm = score_hmm_timeswap_shuffle(bst=PBEs_test,
                                                        hmm=hmm,
                                                        n_shuffles=n_shuffles)

        scores_hmm_shuffled[:,:] = scores_tswap_hmm.T
    else:
        hmm_shuffled = copy.deepcopy(hmm)
        for nn in range(n_shuffles):
            # shuffle transition matrix:
            if rowwise:
                hmm_shuffled.transmat_ = shuffle_transmat(hmm_shuffled.transmat)
            else:
                hmm_shuffled.transmat_ = shuffle_transmat_Kourosh_breaks_stochasticity(hmm_shuffled.transmat)
                hmm_shuffled.transmat_ = hmm_shuffled.transmat / np.tile(hmm_shuffled.transmat.sum(axis=1), (hmm_shuffled.n_components, 1)).T

            # score validation set with shuffled HMM
            scores_hmm_shuffled[:, nn] = hmm_shuffled.score(PBEs_test)

    n_scores = len(scores_hmm)
    scores_hmm_percentile = np.array([stats.percentileofscore(scores_hmm_shuffled[idx], scores_hmm[idx], kind='mean') for idx in range(n_scores)])

    return scores_hmm, scores_hmm_shuffled, scores_hmm_percentile 
Example #13
Source File: statistics.py    From gs-quant with Apache License 2.0
def percentiles(x: pd.Series, y: pd.Series = None, w: Union[Window, int] = Window(None, 0)) -> pd.Series:
    """
    Rolling percentiles over given window

    :param x: value series
    :param y: distribution series
    :param w: Window or int: size of window and ramp up to use. e.g. Window(22, 10) where 22 is the window size
              and 10 the ramp up value. Window size defaults to length of series.
    :return: timeseries of percentiles

    **Usage**

    Calculate `percentile rank <https://en.wikipedia.org/wiki/Percentile_rank>`_ of :math:`y` in the sample distribution
    of :math:`x` over a rolling window of length :math:`w`:

    :math:`R_t = \\frac{\sum_{i=t-N+1}^{t}{[X_i<{Y_t}]}+0.5\sum_{i=t-N+1}^{t}{[X_i={Y_t}]}}{N}\\times100\%`

    Where :math:`N` is the number of observations in a rolling window. If :math:`y` is not provided, calculates
    percentiles of :math:`x` over its historical values. If window length :math:`w` is not provided, uses an
    ever-growing history of values. If :math:`w` is greater than the available data size, returns empty.

    **Examples**

    Compute percentile ranks of a series in the sample distribution of a second series over :math:`22` observations

    >>> a = generate_series(100)
    >>> b = generate_series(100)
    >>> percentiles(a, b, 22)

    **See also**

    :func:`zscores`

    """
    w = normalize_window(x, w)
    if x.empty:
        return x

    if y is None:
        y = x.copy()

    if isinstance(w.r, int) and w.r > len(y):
        raise ValueError('Ramp value must be less than the length of the series y.')

    if isinstance(w.w, int) and w.w > len(x):
        return pd.Series()

    res = pd.Series(dtype=np.dtype(float))
    for idx, val in y.iteritems():
        sample = x.loc[(x.index > idx - w.w) & (x.index <= idx)] if isinstance(w.w, pd.DateOffset) else x[:idx][-w.w:]
        res.loc[idx] = percentileofscore(sample, val, kind='mean')

    if isinstance(w.r, pd.DateOffset):
        return res.loc[res.index[0] + w.r:]
    else:
        return res[w.r:] 
Example #14
Source File: dataset.py    From tropycal with MIT License
def season_composite(self,seasons,climo_bounds=None):
        
        r"""
        Create composite statistics for a list of seasons.
        
        Parameters
        ----------
        seasons : list
            List of seasons to create a composite of. For Southern Hemisphere, season is the start of the two-year period.
        climo_bounds : list or tuple
            List or tuple of start and end years of climatology bounds. If none, defaults to (1981,2010).
        
        Returns
        -------
        dict
            Dictionary containing the composite of the requested seasons.
        """
        
        #Error check
        if isinstance(seasons,list) == False:
            raise TypeError("'seasons' must be of type list.")
        
        #Create climo bounds
        if climo_bounds is None:
            climo_bounds = (1981,2010)
        
        #Get Season object for the composite
        summary = self.get_season(seasons).summary()
        
        #Get basin climatology
        climatology = self.climatology(climo_bounds[0],climo_bounds[1])
        full_climo = self.to_dataframe()
        subset_climo = full_climo.loc[climo_bounds[0]:climo_bounds[1]+1]
        
        #Create composite dictionary
        map_keys = {'all_storms':'season_storms',
                    'named_storms':'season_named',
                    'hurricanes':'season_hurricane',
                    'major_hurricanes':'season_major',
                    'ace':'season_ace',
                   }
        composite = {}
        for key in map_keys.keys():
            
            #Get list from seasons
            season_list = summary[map_keys.get(key)]
            
            #Get climatology
            season_climo = climatology[key]
            
            #Get individual years in climatology
            season_fullclimo = subset_climo[key].values
            
            #Create dictionary of relevant calculations for this entry
            composite[key] = {'list':season_list,
                              'average':np.round(np.average(season_list),1),
                              'composite_anomaly':np.round(np.average(season_list)-season_climo,1),
                              'percentile_ranks':[np.round(stats.percentileofscore(season_fullclimo,i),1) for i in season_list],
                             }
        
        return composite