Python patsy.dmatrices() Examples

The following are 28 code examples of patsy.dmatrices(), drawn from open-source projects. The source file, project, and license are noted above each example.
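Before the examples, a minimal sketch (with made-up toy data) of what dmatrices() does: it takes a patsy formula plus data and returns the left- and right-hand-side design matrices.

import pandas as pd
from patsy import dmatrices

# Toy data; column names must match the variables in the formula.
df = pd.DataFrame({"y": [1.0, 2.0, 3.0, 4.0],
                   "a": [0.5, 1.5, 2.5, 3.5],
                   "b": ["u", "v", "u", "v"]})

# A two-sided formula yields the (outcome, predictors) pair.
# return_type="dataframe" keeps pandas labels; the default is a patsy
# DesignMatrix carrying .design_info metadata, used by several examples below.
y, X = dmatrices("y ~ a + b", df, return_type="dataframe")
print(y.shape, X.shape)  # (4, 1) (4, 3): Intercept, dummy-coded b, and a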
Example #1
Source File: patsy_adaptor.py    From patsylearn with GNU General Public License v2.0
def fit(self, data, y=None):
        """Fit the scikit-learn model using the formula.

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Contains features and possible labels.
            Column names need to match variables in formula.
        """
        eval_env = EvalEnvironment.capture(self.eval_env, reference=1)
        formula = _drop_intercept(self.formula, self.add_intercept)
        design_y, design_X = dmatrices(formula, data, eval_env=eval_env,
                                       NA_action=self.NA_action)
        self.design_y_ = design_y.design_info
        self.design_X_ = design_X.design_info
        self.feature_names_ = design_X.design_info.column_names
        # convert to 1d vector so we don't get a warning
        # from sklearn.
        design_y = column_or_1d(design_y)
        est = clone(self.estimator)
        self.estimator_ = est.fit(design_X, design_y)
        return self 
Example #2
Source File: patsy_wraps.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def transform_with_patsy(formula, data, *args, **kwargs):
    try:
        # needs patsy v0.5.1 to support formula in Python 3.7
        # https://github.com/pydata/patsy/pull/131
        import patsy
    except ImportError:
        raise ImportError("'patsy' is required to transform with string formula")

    if '~' in formula:
        y, X = patsy.dmatrices(formula, data=data, return_type='dataframe',
                               *args, **kwargs)
        if len(y.shape) > 1 and y.shape[1] != 1:
            raise ValueError('target must be 1 dimensional')
        y = y.iloc[:, 0]
        return data._constructor(X, target=y)
    else:
        X = patsy.dmatrix(formula, data=data, return_type='dataframe',
                          *args, **kwargs)
        return data._constructor(X) 
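The branch above keys off '~': two-sided formulas go through dmatrices(), one-sided ones through dmatrix(). A minimal sketch of the two paths with toy data:

import pandas as pd
import patsy

df = pd.DataFrame({"y": [1.0, 2.0, 3.0], "x": [0.1, 0.2, 0.3]})

# '~' present: both target and features come back.
y, X = patsy.dmatrices("y ~ x", data=df, return_type="dataframe")

# No '~': the formula describes features only, so dmatrix() applies;
# I(...) evaluates arbitrary Python inside the formula.
X_only = patsy.dmatrix("x + I(x ** 2)", data=df, return_type="dataframe")
print(X_only.columns.tolist())  # ['Intercept', 'x', 'I(x ** 2)']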
Example #3
Source File: data.py    From vnpy_crypto with MIT License
def __setstate__(self, d):
        if "restore_design_info" in d:
            # NOTE: there may be a more performant way to do this
            from patsy import dmatrices, PatsyError
            exc = []
            try:
                data = d['frame']
            except KeyError:
                data = d['orig_endog'].join(d['orig_exog'])

            for depth in [2, 3, 1, 0, 4]:  # the sequence is a guess at the likely capture depth
                try:
                    _, design = dmatrices(d['formula'], data, eval_env=depth,
                                          return_type='dataframe')
                    break
                except (NameError, PatsyError) as e:
                    print('not in depth %d' % depth)
                    # keep a reference: Python 3 clears `e` when the except
                    # block exits, so it must be saved to be re-raised later
                    exc.append(e)
            else:
                raise exc[-1]

            self.design_info = design.design_info
            del d["restore_design_info"]
        self.__dict__.update(d) 
Example #4
Source File: data.py    From Splunking-Crime with GNU Affero General Public License v3.0
def __setstate__(self, d):
        if "restore_design_info" in d:
            # NOTE: there may be a more performant way to do this
            from patsy import dmatrices, PatsyError
            exc = []
            try:
                data = d['frame']
            except KeyError:
                data = d['orig_endog'].join(d['orig_exog'])

            for depth in [2, 3, 1, 0, 4]:  # the sequence is a guess at the likely capture depth
                try:
                    _, design = dmatrices(d['formula'], data, eval_env=depth,
                                          return_type='dataframe')
                    break
                except (NameError, PatsyError) as e:
                    print('not in depth %d' % depth)
                    # keep a reference: Python 3 clears `e` when the except
                    # block exits, so it must be saved to be re-raised later
                    exc.append(e)
            else:
                raise exc[-1]

            self.design_info = design.design_info
            del d["restore_design_info"]
        self.__dict__.update(d) 
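The depth loop in Examples #3 and #4 exists because eval_env tells patsy how many stack frames above dmatrices' caller to look for names that are not columns of the data, and the original capture depth is not stored in the pickle. A small sketch of the mechanism, using hypothetical functions build() and main():

import pandas as pd
from patsy import dmatrices

def build(formula, data):
    # eval_env counts frames above dmatrices' caller: 0 is build()'s own
    # namespace, 1 is whoever called build(). `threshold` below lives in
    # main()'s local scope, so depth 1 is the one that finds it.
    return dmatrices(formula, data, eval_env=1, return_type="dataframe")

def main():
    threshold = 2.0  # a local variable, not a column of the data frame
    df = pd.DataFrame({"y": [1.0, 2.0, 3.0], "x": [1.0, 4.0, 9.0]})
    return build("y ~ I(x > threshold)", df)

y, X = main()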
Example #5
Source File: ordered_logit.py    From estimagic with BSD 3-Clause "New" or "Revised" License
def ordered_logit_processing(formula, data):
    """Process user input for an ordered logit model."""
    # extract data arrays
    y, x = dmatrices(formula + " - 1", data, return_type="dataframe")
    y = y[y.columns[0]]

    # extract dimensions
    num_choices = len(y.unique())
    beta_names = list(x.columns)
    num_betas = len(beta_names)
    num_cutoffs = num_choices - 1

    # set-up index for params_df
    names = beta_names + list(range(num_cutoffs))
    categories = ["beta"] * num_betas + ["cutoff"] * num_cutoffs
    index = pd.MultiIndex.from_tuples(zip(categories, names), names=["type", "name"])

    # make params_df
    np.random.seed(5471)
    start_params = pd.DataFrame(index=index)
    start_params["value"] = np.hstack(
        [
            np.random.uniform(low=-0.5, high=0.5, size=len(x.columns)),
            np.arange(num_cutoffs) * 2,
        ]
    )

    # make constraints
    constr = [{"loc": "cutoff", "type": "increasing"}]

    return start_params, y.to_numpy().astype(int), x.to_numpy(), constr 
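The " - 1" appended to the formula above suppresses patsy's automatic intercept, which the ordered-logit model absorbs into its cutoff parameters instead. A quick sketch of the effect on toy data:

import pandas as pd
from patsy import dmatrices

df = pd.DataFrame({"choice": [0, 1, 2, 1], "age": [20, 30, 40, 50]})
y, x = dmatrices("choice ~ age - 1", df, return_type="dataframe")
print(x.columns.tolist())  # ['age'] -- no 'Intercept' column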
Example #6
Source File: dynlin.py    From pyflux with BSD 3-Clause "New" or "Revised" License
def __init__(self,formula,data):

        # Initialize TSM object
        super(DynReg,self).__init__('DynReg')

        # Latent variables
        self.max_lag = 0
        self._z_hide = 0 # Whether to cut off variance latent variables from results
        self.supported_methods = ["MLE","PML","Laplace","M-H","BBVI"]
        self.default_method = "MLE"
        self.model_name = "Dynamic Linear Regression"
        self.multivariate_model = False

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.z_no = self.X.shape[1] + 1
        self.y_name = self.y.design_info.describe()
        self.data_name = self.y_name
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = np.array([self.y]).ravel()
        self.data = self.y
        self.X = np.array([self.X])[0]
        self.index = data.index

        self._create_latent_variables() 
Example #7
Source File: egarchmreg.py    From pyflux with BSD 3-Clause "New" or "Revised" License
def __init__(self, data, p, q, formula):

        # Initialize TSM object
        super(EGARCHMReg,self).__init__('EGARCHMReg')

        # Latent variables
        self.p = p
        self.q = q
        self.max_lag = max(self.p,self.q)  
        self.z_no = self.p + self.q + 2
        self._z_hide = 0 # Whether to cut off variance latent variables from results
        self.supported_methods = ["MLE","PML","Laplace","M-H","BBVI"]
        self.default_method = "MLE"
        self.multivariate_model = False
        self.leverage = False
        self.model_name = "EGARCHMReg(" + str(self.p) + "," + str(self.q) + ")"

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.z_no += self.X.shape[1]*2
        self.y_name = self.y.design_info.describe()
        self.data_name = self.y_name
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = np.array([self.y]).ravel()
        self.data = self.y
        self.data_length = len(self.data)
        self.X = np.array([self.X])[0]
        self.index = data.index
        self.initial_values = np.zeros(self.z_no)

        self._create_latent_variables() 
Example #8
Source File: utils.py    From pymer4 with MIT License
def _chunk_boot_ols_coefs(dat, formula, weights, seed):
    """
    OLS computation of coefficients to be used in a parallelization context.
    """
    # Random sample with replacement from all data
    dat = dat.sample(frac=1, replace=True, random_state=seed)
    y, x = dmatrices(formula, dat, 1, return_type="dataframe")
    b = _ols(
        x, y, robust=None, n_lags=1, cluster=None, all_stats=False, weights=weights
    )
    return list(b) 
Example #9
Source File: utils.py    From pymer4 with MIT License
def _ols_group(dat, formula, group_col, group, rank):
    """Compute OLS on data given a formula. Used by Lm2"""
    dat = dat[dat[group_col] == group].reset_index(drop=True)
    if rank:
        dat = dat.rank()
    y, x = dmatrices(formula, dat, 1, return_type="dataframe")
    b = _ols(x, y, robust=None, n_lags=1, cluster=None, all_stats=False)
    return list(b) 
Example #10
Source File: smoothers.py    From plotnine with GNU General Public License v2.0
def gls_formula(data, xseq, **params):
    """
    Fit GLS using a formula
    """
    eval_env = params['enviroment']  # sic: the params dict uses this spelling
    formula = params['formula']
    init_kwargs, fit_kwargs = separate_method_kwargs(
        params['method_args'], sm.GLS, sm.GLS.fit)
    model = smf.gls(
        formula,
        data,
        eval_env=eval_env,
        **init_kwargs
    )
    results = model.fit(**fit_kwargs)
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(data)

    if params['se']:
        _, predictors = dmatrices(formula, data, eval_env=eval_env)
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(
            results, predictors, alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u
    return data 
Example #11
Source File: utils.py    From pymer4 with MIT License 5 votes vote down vote up
def _corr_group(dat, formula, group_col, group, rank, corr_type):
    """Compute partial correlations via OLS. Used by Lm2"""

    from scipy.stats import pearsonr

    dat = dat[dat[group_col] == group].reset_index(drop=True)
    if rank:
        dat = dat.rank()
    y, x = dmatrices(formula, dat, 1, return_type="dataframe")
    corrs = []
    for c in x.columns[1:]:
        other_preds = [e for e in x.columns if e != c]
        other_preds = x[other_preds]
        cc = x[c]
        pred_m_resid = _ols(
            other_preds,
            cc,
            robust=None,
            n_lags=1,
            cluster=None,
            all_stats=False,
            resid_only=True,
        )
        if corr_type == "semi":
            dv_m_resid = y.values.squeeze()
        elif corr_type == "partial":
            dv_m_resid = _ols(
                other_preds,
                y,
                robust=None,
                n_lags=1,
                cluster=None,
                all_stats=False,
                resid_only=True,
            )
        corrs.append(pearsonr(dv_m_resid, pred_m_resid)[0])
    return corrs 
Example #12
Source File: model_recommendation.py    From DIVE-backend with GNU General Public License v3.0
def f_regression(df, dependent_variable, independent_variables, interaction_terms=[], model_limit=5):
    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables, interaction_terms, table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    # this function shadows sklearn.feature_selection.f_regression, so the
    # univariate test is imported under an alias to avoid accidental recursion
    from sklearn.feature_selection import f_regression as sklearn_f_regression
    f_test, r = sklearn_f_regression(X, y, center=True)
    logger.info(f_test)
    logger.info(r)
    return 
Example #13
Source File: model_recommendation.py    From DIVE-backend with GNU General Public License v3.0
def recursive_feature_elimination(df, dependent_variable, independent_variables, interaction_terms=[], model_limit=5):
    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables, interaction_terms, table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    estimator = SVR(kernel='linear')
    selector = RFE(estimator, n_features_to_select=5, step=1)
    selector = selector.fit(X, y)
    logger.info(selector.support_)
    logger.info(selector.ranking_)
    return 
Example #14
Source File: pipelines.py    From DIVE-backend with GNU General Public License v3.0
def run_logistic_regression(df, patsy_model, dependent_variable, estimator, weights):
    y, X = dmatrices(patsy_model, df, return_type='dataframe')

    model_result = discrete_model.MNLogit(y, X).fit(maxiter=100, disp=False, method="nm")

    p_values = model_result.pvalues[0].to_dict()
    t_values = model_result.tvalues[0].to_dict()
    params = model_result.params[0].to_dict()
    ste = model_result.bse[0].to_dict()

    constants = {
        'p_value': p_values.get('Intercept'),
        't_value': t_values.get('Intercept'),
        'coefficient': params.get('Intercept'),
        'standard_error': ste.get('Intercept')
    }

    regression_field_properties = {
        'p_value': p_values,
        't_value': t_values,
        'coefficient': params,
        'standard_error': ste
    }

    total_regression_properties = {
        'aic': model_result.aic,
        'bic': model_result.bic,
        'r_squared': model_result.prsquared,
        'r_squared_adj': model_result.prsquared,  # MNLogit has no adjusted pseudo R-squared
        'llf': model_result.llf,
        'llnull': model_result.llnull,
        'llr_pvalue': model_result.llr_pvalue
        # 'f_test': model_result.f_test
    }

    regression_results = restructure_field_properties_dict(constants, regression_field_properties, total_regression_properties)

    return regression_results 
Example #15
Source File: pipelines.py    From DIVE-backend with GNU General Public License v3.0
def run_linear_regression(df, patsy_model, dependent_variable, estimator, weights):
    y, X = dmatrices(patsy_model, df, return_type='dataframe')
    model_result = sm.OLS(y, X).fit()

    p_values = model_result.pvalues.to_dict()
    t_values = model_result.tvalues.to_dict()
    params = model_result.params.to_dict()
    ste = model_result.bse.to_dict()
    conf_ints = parse_confidence_intervals(model_result)

    constants = {
        'p_value': p_values.get('Intercept'),
        't_value': t_values.get('Intercept'),
        'coefficient': params.get('Intercept'),
        'standard_error': ste.get('Intercept'),
        'conf_int': conf_ints.get('Intercept')
    }

    regression_field_properties = {
        'p_value': p_values,
        't_value': t_values,
        'coefficient': params,
        'standard_error': ste,
        'conf_int': conf_ints
    }

    total_regression_properties = {
        'aic': model_result.aic,
        'bic': model_result.bic,
        'dof': model_result.nobs,  # NOTE: the observation count, stored under 'dof'
        'r_squared': model_result.rsquared,
        'r_squared_adj': model_result.rsquared_adj,
        'f_test': model_result.fvalue,
        # 'resid': model_result.resid.tolist()
    }

    regression_results = restructure_field_properties_dict(constants, regression_field_properties, total_regression_properties)

    return regression_results 
Example #16
Source File: utilities.py    From DIVE-backend with GNU General Public License v3.0
def get_design_matrices(df, dependent_variable, independent_variables, interactions=[]):
    patsy_model = create_patsy_model(dependent_variable, independent_variables, interactions=interactions)
    y, X = dmatrices(patsy_model, df, return_type='dataframe')
    return (y, X) 
Example #17
Source File: test_multivariate_ols.py    From vnpy_crypto with MIT License
def test_from_formula_vs_no_formula():
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data)
    r = mod.fit(method='svd')
    r0 = r.mv_test()
    endog, exog = patsy.dmatrices(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data, return_type="dataframe")
    L = np.array([[1, 0, 0, 0, 0, 0]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Intercept', L, None]])
    assert_array_almost_equal(r1['Intercept']['stat'].values,
                              r0['Intercept']['stat'].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Intercept', L, None]])
    assert_array_almost_equal(r1['Intercept']['stat'].values,
                              r0['Intercept']['stat'].values, decimal=6)
    L = np.array([[0, 1, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0],
                  ])
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    assert_array_almost_equal(r1['Drug']['stat'].values,
                              r0['Drug']['stat'].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    assert_array_almost_equal(r1['Drug']['stat'].values,
                              r0['Drug']['stat'].values, decimal=6) 
Example #18
Source File: patsy_adaptor.py    From patsylearn with GNU General Public License v2.0
def score(self, data):
        """Predict with estimator using formula.

        Transform the data using formula, then predict on it
        using the estimator.

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Column names need to match variables in formula.
            Data needs to contain the label column.
        """
        design_infos = (self.design_y_, self.design_X_)
        design_y, design_X = dmatrices(design_infos, data)
        return self.estimator_.score(design_X, design_y) 
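Example #18 feeds saved DesignInfo objects back into dmatrices() so that test data is encoded with the codings learned at fit time. patsy's build_design_matrices() exposes the same stateful mechanism directly; a minimal sketch with toy data:

import pandas as pd
from patsy import dmatrices, build_design_matrices

train = pd.DataFrame({"y": [1.0, 2.0, 3.0], "g": ["a", "b", "a"]})
new = pd.DataFrame({"g": ["b", "a"]})

y, X = dmatrices("y ~ g", train)

# design_info remembers the factor coding, so new data is dummy-coded
# against the *training* categories -- the property score() relies on.
(X_new,) = build_design_matrices([X.design_info], new)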
Example #19
Source File: formulatools.py    From Splunking-Crime with GNU Affero General Public License v3.0
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and the
        value is a function that returns endog, exog, and a formula object.
    depth : int
        Stack depth passed to patsy as ``eval_env`` for variable lookup.
    missing : str
        Passed to ``NAAction`` as ``on_NA``; typically 'drop' or 'raise'.

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        # the pandas and plain-array paths build the matrices identically
        result = dmatrices(formula, (Y, X), depth,
                           return_type='dataframe', NA_action=na_action)
    else:
        result = dmatrices(formula, Y, depth, return_type='dataframe',
                           NA_action=na_action)

    # if missing == 'raise' there's no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info 
Example #20
Source File: Lm.py    From pymer4 with MIT License
def to_corrs(self, corr_type="semi", ztrans_corrs=False):
        """
        Transform fitted model coefficients (excluding the intercept) to partial or
        semi-partial correlations with the dependent variable. This is useful for
        rescaling coefficients to a correlation scale (-1 to 1) and does **not**
        change how inferences are performed.

        Semi-partial correlations are computed as the correlation between the DV and
        each predictor *after* the influence of all other predictors has been
        regressed out of that predictor. They are interpretable in the same way as
        the original coefficients.

        Partial correlations reflect the unique variance a predictor explains in the
        DV, accounting for correlations between predictors *and* what is not
        explained by other predictors; this value is always >= the semi-partial
        correlation. They are computed as the correlation between the DV and each
        predictor *after* the influence of all other predictors has been regressed
        out of both that predictor *and* the DV, and are *not* interpretable in the
        same way as the original coefficients. Good ref: https://bit.ly/2GNwXh5

        Args:
            corr_type (string): 'semi' or 'partial'
            ztrans_corrs (bool): whether to Fisher z-transform (arctanh) correlations before reporting them; default False

        Returns:
            pd.Series: partial or semi-partial correlations

        """

        if not self.fitted:
            raise RuntimeError(
                "Model must be fit before partial correlations can be computed"
            )
        if corr_type not in ["semi", "partial"]:
            raise ValueError("corr_type must be 'semi' or 'partial'")
        from scipy.stats import pearsonr

        corrs = []
        corrs.append(np.nan)  # don't compute for intercept
        for c in self.design_matrix.columns[1:]:
            dv = self.formula.split("~")[0]
            other_preds = [e for e in self.design_matrix.columns[1:] if e != c]
            right_side = "+".join(other_preds)
            y, x = dmatrices(
                c + "~" + right_side, self.data, 1, return_type="dataframe"
            )
            pred_m_resid = _ols(
                x,
                y,
                robust=False,
                n_lags=1,
                cluster=None,
                all_stats=False,
                resid_only=True,
            )
            y, x = dmatrices(
                dv + "~" + right_side, self.data, 1, return_type="dataframe"
            )
            if corr_type == "semi":
                dv_m_resid = y.values.squeeze()
            elif corr_type == "partial":
                dv_m_resid = _ols(
                    x,
                    y,
                    robust=False,
                    n_lags=1,
                    cluster=None,
                    all_stats=False,
                    resid_only=True,
                )
            corrs.append(pearsonr(dv_m_resid, pred_m_resid)[0])
        if ztrans_corrs:
            corrs = np.arctanh(corrs)
        return pd.Series(corrs, index=self.coefs.index) 
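For intuition, here is a self-contained numeric sketch of the semi-partial computation the docstring describes, with plain numpy least squares standing in for pymer4's internal _ols helper and random toy data:

import numpy as np
import pandas as pd
from patsy import dmatrices
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=["y", "a", "b"])

def resid(X, v):
    # Residual of v after regressing it on X.
    beta, *_ = np.linalg.lstsq(X, v, rcond=None)
    return v - X @ beta

# Semi-partial r for `a`: residualize `a` on the remaining predictors
# (intercept and b), then correlate the residual with the raw DV.
y, X = dmatrices("y ~ a + b", df, return_type="dataframe")
a_resid = resid(X[["Intercept", "b"]].to_numpy(), X["a"].to_numpy())
r_semi, _ = pearsonr(df["y"].to_numpy(), a_resid)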
Example #21
Source File: nnarx.py    From pyflux with BSD 3-Clause "New" or "Revised" License
def __init__(self, data, formula, ar, units, layers, family=fam.Normal()):

        # Initialize TSM object
        super(NNARX, self).__init__('NNARX')

        # Latent Variable information
        self.ar = ar
        self.units = units
        self.layers = layers
        self.activation = np.tanh
        self.model_name = "NNARX(" + str(self.ar) + ")"
        self.z_no = self.ar + 2
        self.max_lag = self.ar
        self._z_hide = 0 # Whether to cut off latent variables from results table
        self.supported_methods = ["BBVI"]
        self.default_method = "BBVI"
        self.multivariate_model = False

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data.copy()
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.y_name = self.y.design_info.describe()
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = self.y.astype(float)  # np.float was removed in NumPy 1.24
        self.X = self.X.astype(float)
        self.z_no = self.X.shape[1]
        self.data_name = self.y_name
        self.y = np.array([self.y]).ravel()
        self.data = self.y.copy()
        self.X = np.array([self.X])[0]
        self.index = data.index
        self.data_length = self.data.shape[0]
        self.X = self.X[self.ar:, :]
        self.X = np.concatenate([self._ar_matrix().T, self.X], axis=1).T

        self._create_latent_variables()

        self.family = family
        
        self.model_name2, self.link, self.scale, self.shape, self.skewness, self.mean_transform, self.cythonized = self.family.setup()
        
        self.model_name = self.model_name2 + " NNARX(" + str(self.ar) + ")"

        # Build any remaining latent variables that are specific to the family chosen
        for no, i in enumerate(self.family.build_latent_variables()):
            self.latent_variables.add_z(i[0], i[1], i[2])
            self.latent_variables.z_list[-1].start = i[3]

        self.z_no = len(self.latent_variables.z_list)
        self.family_z_no = len(self.family.build_latent_variables())

        # Initialize with random weights
        for var_no in range(len(self.latent_variables.z_list)-self.family_z_no):
            self.latent_variables.z_list[var_no].start = np.random.normal()

        if isinstance(self.family, fam.Normal):
            self.neg_loglik = self.normal_neg_loglik
        else:
            self.neg_loglik = self.general_neg_loglik 
Example #22
Source File: gasreg.py    From pyflux with BSD 3-Clause "New" or "Revised" License
def __init__(self, formula, data, family):

        # Initialize TSM object     
        super(GASReg,self).__init__('GASReg')

        # Latent Variables
        self.max_lag = 0
        self._z_hide = 0 # Whether to cut off variance latent variables from results
        self.supported_methods = ["MLE","PML","Laplace","M-H","BBVI"]
        self.default_method = "MLE"
        self.multivariate_model = False
        self.skewness = False

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.y_name = self.y.design_info.describe()
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = self.y.astype(float)  # np.float was removed in NumPy 1.24
        self.X = self.X.astype(float)
        self.z_no = self.X.shape[1]
        self.data_name = self.y_name
        self.y = np.array([self.y]).ravel()
        self.data = self.y
        self.X = np.array([self.X])[0]
        self.index = data.index
        self.initial_values = np.zeros(self.z_no)

        self.data_length = self.data.shape[0]
        self._create_model_matrices()
        self._create_latent_variables()

        self.family = family
        
        self.model_name2, self.link, self.scale, self.shape, self.skewness, self.mean_transform, self.cythonized = self.family.setup()
    
        # Identify whether model has cythonized backend - then choose update type
        if self.cythonized is True:
            self._model = self._cythonized_model 
            self._mb_model = self._cythonized_mb_model
            self.recursion = self.family.gradientreg_recursion()
        else:
            self._model = self._uncythonized_model
            self._mb_model = self._uncythonized_mb_model

        self.model_name = self.model_name2 + " GAS Regression"

        # Build any remaining latent variables that are specific to the family chosen
        for no, i in enumerate(self.family.build_latent_variables()):
            self.latent_variables.add_z(i[0],i[1],i[2])
            self.latent_variables.z_list[no+self.z_no].start = i[3]

        self.family_z_no = len(self.family.build_latent_variables())
        self.z_no += len(self.family.build_latent_variables()) 
Example #23
Source File: mice.py    From Splunking-Crime with GNU Affero General Public License v3.0
def get_fitting_data(self, vname):
        """
        Return the data needed to fit a model for imputation.

        The data is used to impute variable `vname`, and therefore
        only includes cases for which `vname` is observed.

        Values of type `PatsyFormula` in `init_kwds` or `fit_kwds` are
        processed through Patsy and subset to align with the model's
        endog and exog.

        Parameters
        ----------
        vname : string
           The variable for which the fitting data is returned.

        Returns
        -------
        endog : DataFrame
            Observed values of `vname`.
        exog : DataFrame
            Regression design matrix for imputing `vname`.
        init_kwds : dict-like
            The init keyword arguments for `vname`, processed through Patsy
            as required.
        fit_kwds : dict-like
            The fit keyword arguments for `vname`, processed through Patsy
            as required.
        """

        # Rows with observed endog
        ix = self.ix_obs[vname]

        formula = self.conditional_formula[vname]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")

        endog = np.asarray(endog.iloc[ix, 0])
        exog = np.asarray(exog.iloc[ix, :])

        init_kwds = self._process_kwds(self.init_kwds[vname], ix)
        fit_kwds = self._process_kwds(self.fit_kwds[vname], ix)

        return endog, exog, init_kwds, fit_kwds 
Example #24
Source File: mice.py    From Splunking-Crime with GNU Affero General Public License v3.0
def get_split_data(self, vname):
        """
        Return endog and exog for imputation of a given variable.

        Parameters
        ----------
        vname : string
           The variable for which the split data is returned.

        Returns
        -------
        endog_obs : DataFrame
            Observed values of the variable to be imputed.
        exog_obs : DataFrame
            Current values of the predictors where the variable to be
            imputed is observed.
        exog_miss : DataFrame
            Current values of the predictors where the variable to be
            imputed is missing.
        init_kwds : dict-like
            The init keyword arguments for `vname`, processed through Patsy
            as required.
        fit_kwds : dict-like
            The fit keyword arguments for `vname`, processed through Patsy
            as required.
        """

        formula = self.conditional_formula[vname]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")

        # Rows with observed endog
        ixo = self.ix_obs[vname]
        endog_obs = np.asarray(endog.iloc[ixo])
        exog_obs = np.asarray(exog.iloc[ixo, :])

        # Rows with missing endog
        ixm = self.ix_miss[vname]
        exog_miss = np.asarray(exog.iloc[ixm, :])

        predict_obs_kwds = {}
        if vname in self.predict_kwds:
            kwds = self.predict_kwds[vname]
            predict_obs_kwds = self._process_kwds(kwds, ixo)

        predict_miss_kwds = {}
        if vname in self.predict_kwds:
            kwds = self.predict_kwds[vname]
            predict_miss_kwds = self._process_kwds(kwds, ixm)  # subset kwds to the *missing* rows

        return endog_obs, exog_obs, exog_miss, predict_obs_kwds, predict_miss_kwds 
Example #25
Source File: smoothers.py    From plotnine with GNU General Public License v2.0
def lm_formula(data, xseq, **params):
    """
    Fit OLS / WLS using a formula
    """
    formula = params['formula']
    eval_env = params['enviroment']  # sic: the params dict uses this spelling
    weights = data.get('weight', None)

    if weights is None:
        init_kwargs, fit_kwargs = separate_method_kwargs(
            params['method_args'], sm.OLS, sm.OLS.fit)
        model = smf.ols(
            formula,
            data,
            eval_env=eval_env,
            **init_kwargs
        )
    else:
        if np.any(weights < 0):
            raise ValueError(
                "All weights must be non-negative."
            )
        init_kwargs, fit_kwargs = separate_method_kwargs(
            params['method_args'], sm.OLS, sm.OLS.fit)
        model = smf.wls(
            formula,
            data,
            weights=weights,
            eval_env=eval_env,
            **init_kwargs
        )

    results = model.fit(**fit_kwargs)
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(data)

    if params['se']:
        _, predictors = dmatrices(formula, data, eval_env=eval_env)
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(
            results, predictors, alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u
    return data 
Example #26
Source File: mice.py    From vnpy_crypto with MIT License
def get_fitting_data(self, vname):
        """
        Return the data needed to fit a model for imputation.

        The data is used to impute variable `vname`, and therefore
        only includes cases for which `vname` is observed.

        Values of type `PatsyFormula` in `init_kwds` or `fit_kwds` are
        processed through Patsy and subset to align with the model's
        endog and exog.

        Parameters
        ----------
        vname : string
           The variable for which the fitting data is returned.

        Returns
        -------
        endog : DataFrame
            Observed values of `vname`.
        exog : DataFrame
            Regression design matrix for imputing `vname`.
        init_kwds : dict-like
            The init keyword arguments for `vname`, processed through Patsy
            as required.
        fit_kwds : dict-like
            The fit keyword arguments for `vname`, processed through Patsy
            as required.
        """

        # Rows with observed endog
        ix = self.ix_obs[vname]

        formula = self.conditional_formula[vname]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")

        endog = np.asarray(endog.iloc[ix, 0])
        exog = np.asarray(exog.iloc[ix, :])

        init_kwds = self._process_kwds(self.init_kwds[vname], ix)
        fit_kwds = self._process_kwds(self.fit_kwds[vname], ix)

        return endog, exog, init_kwds, fit_kwds 
Example #27
Source File: mice.py    From vnpy_crypto with MIT License
def get_split_data(self, vname):
        """
        Return endog and exog for imputation of a given variable.

        Parameters
        ----------
        vname : string
           The variable for which the split data is returned.

        Returns
        -------
        endog_obs : DataFrame
            Observed values of the variable to be imputed.
        exog_obs : DataFrame
            Current values of the predictors where the variable to be
            imputed is observed.
        exog_miss : DataFrame
            Current values of the predictors where the variable to be
            imputed is missing.
        init_kwds : dict-like
            The init keyword arguments for `vname`, processed through Patsy
            as required.
        fit_kwds : dict-like
            The fit keyword arguments for `vname`, processed through Patsy
            as required.
        """

        formula = self.conditional_formula[vname]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")

        # Rows with observed endog
        ixo = self.ix_obs[vname]
        endog_obs = np.asarray(endog.iloc[ixo])
        exog_obs = np.asarray(exog.iloc[ixo, :])

        # Rows with missing endog
        ixm = self.ix_miss[vname]
        exog_miss = np.asarray(exog.iloc[ixm, :])

        predict_obs_kwds = {}
        if vname in self.predict_kwds:
            kwds = self.predict_kwds[vname]
            predict_obs_kwds = self._process_kwds(kwds, ixo)

        predict_miss_kwds = {}
        if vname in self.predict_kwds:
            kwds = self.predict_kwds[vname]
            predict_miss_kwds = self._process_kwds(kwds, ixm)  # subset kwds to the *missing* rows

        return (endog_obs, exog_obs, exog_miss, predict_obs_kwds,
                predict_miss_kwds) 
Example #28
Source File: formulatools.py    From vnpy_crypto with MIT License
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and the
        value is a function that returns endog, exog, and a formula object.
    depth : int
        Stack depth passed to patsy as ``eval_env`` for variable lookup.
    missing : str
        Passed to ``NAAction`` as ``on_NA``; typically 'drop' or 'raise'.

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        # the pandas and plain-array paths build the matrices identically
        result = dmatrices(formula, (Y, X), depth,
                           return_type='dataframe', NA_action=na_action)
    else:
        result = dmatrices(formula, Y, depth, return_type='dataframe',
                           NA_action=na_action)

    # if missing == 'raise' there's no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info
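Finally, the missing= argument above maps onto patsy's NA handling (statsmodels subclasses NAAction so it can additionally record which rows were dropped as missing_mask). The patsy side, sketched with toy data:

import numpy as np
import pandas as pd
from patsy import dmatrices, PatsyError

df = pd.DataFrame({"y": [1.0, np.nan, 3.0], "x": [0.5, 1.5, 2.5]})

# NA_action="drop" (the default) removes incomplete rows...
y, X = dmatrices("y ~ x", df, NA_action="drop")
print(y.shape)  # (2, 1): the NaN row is gone

# ...while NA_action="raise" turns them into an error instead.
try:
    dmatrices("y ~ x", df, NA_action="raise")
except PatsyError as err:
    print(err)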