Python patsy.dmatrices() Examples

The following are 28 code examples of patsy.dmatrices(), drawn from open-source projects. The source file, project, and license are noted above each example.
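Before the examples, a minimal sketch (with made-up toy data) of what dmatrices() does: it takes a patsy formula plus data and returns the left- and right-hand-side design matrices.

import pandas as pd
from patsy import dmatrices

# Toy data; column names must match the variables in the formula.
df = pd.DataFrame({"y": [1.0, 2.0, 3.0, 4.0],
                   "a": [0.5, 1.5, 2.5, 3.5],
                   "b": ["u", "v", "u", "v"]})

# A two-sided formula yields the (outcome, predictors) pair.
# return_type="dataframe" keeps pandas labels; the default is a patsy
# DesignMatrix carrying .design_info metadata, used by several examples below.
y, X = dmatrices("y ~ a + b", df, return_type="dataframe")
print(y.shape, X.shape)  # (4, 1) (4, 3): Intercept, dummy-coded b, and a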
Example #1
Source File: patsy_adaptor.py    From patsylearn with GNU General Public License v2.0
def fit(self, data, y=None):
        """Fit the scikit-learn model using the formula.

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Contains features and possible labels.
            Column names need to match variables in formula.
        """
        eval_env = EvalEnvironment.capture(self.eval_env, reference=1)
        formula = _drop_intercept(self.formula, self.add_intercept)
        design_y, design_X = dmatrices(formula, data, eval_env=eval_env,
                                       NA_action=self.NA_action)
        self.design_y_ = design_y.design_info
        self.design_X_ = design_X.design_info
        self.feature_names_ = design_X.design_info.column_names
        # convert to 1d vector so we don't get a warning
        # from sklearn.
        design_y = column_or_1d(design_y)
        est = clone(self.estimator)
        self.estimator_ = est.fit(design_X, design_y)
        return self 
Example #2
Source File: patsy_wraps.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def transform_with_patsy(formula, data, *args, **kwargs):
    try:
        # needs patsy v0.5.1 to support formula in Python 3.7
        # https://github.com/pydata/patsy/pull/131
        import patsy
    except ImportError:
        raise ImportError("'patsy' is required to transform with string formula")

    if '~' in formula:
        y, X = patsy.dmatrices(formula, data=data, return_type='dataframe',
                               *args, **kwargs)
        if len(y.shape) > 1 and y.shape[1] != 1:
            raise ValueError('target must be 1 dimensional')
        y = y.iloc[:, 0]
        return data._constructor(X, target=y)
    else:
        X = patsy.dmatrix(formula, data=data, return_type='dataframe',
                          *args, **kwargs)
        return data._constructor(X) 
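The branch above keys off '~': two-sided formulas go through dmatrices(), one-sided ones through dmatrix(). A minimal sketch of the two paths with toy data:

import pandas as pd
import patsy

df = pd.DataFrame({"y": [1.0, 2.0, 3.0], "x": [0.1, 0.2, 0.3]})

# '~' present: both target and features come back.
y, X = patsy.dmatrices("y ~ x", data=df, return_type="dataframe")

# No '~': the formula describes features only, so dmatrix() applies;
# I(...) evaluates arbitrary Python inside the formula.
X_only = patsy.dmatrix("x + I(x ** 2)", data=df, return_type="dataframe")
print(X_only.columns.tolist())  # ['Intercept', 'x', 'I(x ** 2)']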
Example #3
Source File: data.py    From vnpy_crypto with MIT License
def __setstate__(self, d):
        if "restore_design_info" in d:
            # NOTE: there may be a more performant way to do this
            from patsy import dmatrices, PatsyError
            exc = []
            try:
                data = d['frame']
            except KeyError:
                data = d['orig_endog'].join(d['orig_exog'])

            for depth in [2, 3, 1, 0, 4]:  # the sequence is a guess at the likely capture depth
                try:
                    _, design = dmatrices(d['formula'], data, eval_env=depth,
                                          return_type='dataframe')
                    break
                except (NameError, PatsyError) as e:
                    print('not in depth %d' % depth)
                    # keep a reference: Python 3 clears `e` when the except
                    # block exits, so it must be saved to be re-raised later
                    exc.append(e)
            else:
                raise exc[-1]

            self.design_info = design.design_info
            del d["restore_design_info"]
        self.__dict__.update(d) 
Example #4
Source File: data.py    From Splunking-Crime with GNU Affero General Public License v3.0
def __setstate__(self, d):
        if "restore_design_info" in d:
            # NOTE: there may be a more performant way to do this
            from patsy import dmatrices, PatsyError
            exc = []
            try:
                data = d['frame']
            except KeyError:
                data = d['orig_endog'].join(d['orig_exog'])

            for depth in [2, 3, 1, 0, 4]:  # the sequence is a guess at the likely capture depth
                try:
                    _, design = dmatrices(d['formula'], data, eval_env=depth,
                                          return_type='dataframe')
                    break
                except (NameError, PatsyError) as e:
                    print('not in depth %d' % depth)
                    # keep a reference: Python 3 clears `e` when the except
                    # block exits, so it must be saved to be re-raised later
                    exc.append(e)
            else:
                raise exc[-1]

            self.design_info = design.design_info
            del d["restore_design_info"]
        self.__dict__.update(d) 
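The depth loop in Examples #3 and #4 exists because eval_env tells patsy how many stack frames above dmatrices' caller to look for names that are not columns of the data, and the original capture depth is not stored in the pickle. A small sketch of the mechanism, using hypothetical functions build() and main():

import pandas as pd
from patsy import dmatrices

def build(formula, data):
    # eval_env counts frames above dmatrices' caller: 0 is build()'s own
    # namespace, 1 is whoever called build(). `threshold` below lives in
    # main()'s local scope, so depth 1 is the one that finds it.
    return dmatrices(formula, data, eval_env=1, return_type="dataframe")

def main():
    threshold = 2.0  # a local variable, not a column of the data frame
    df = pd.DataFrame({"y": [1.0, 2.0, 3.0], "x": [1.0, 4.0, 9.0]})
    return build("y ~ I(x > threshold)", df)

y, X = main()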
Example #5
Source File: ordered_logit.py    From estimagic with BSD 3-Clause "New" or "Revised" License
def ordered_logit_processing(formula, data):
    """Process user input for an ordered logit model."""
    # extract data arrays
    y, x = dmatrices(formula + " - 1", data, return_type="dataframe")
    y = y[y.columns[0]]

    # extract dimensions
    num_choices = len(y.unique())
    beta_names = list(x.columns)
    num_betas = len(beta_names)
    num_cutoffs = num_choices - 1

    # set-up index for params_df
    names = beta_names + list(range(num_cutoffs))
    categories = ["beta"] * num_betas + ["cutoff"] * num_cutoffs
    index = pd.MultiIndex.from_tuples(zip(categories, names), names=["type", "name"])

    # make params_df
    np.random.seed(5471)
    start_params = pd.DataFrame(index=index)
    start_params["value"] = np.hstack(
        [
            np.random.uniform(low=-0.5, high=0.5, size=len(x.columns)),
            np.arange(num_cutoffs) * 2,
        ]
    )

    # make constraints
    constr = [{"loc": "cutoff", "type": "increasing"}]

    return start_params, y.to_numpy().astype(int), x.to_numpy(), constr 
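The " - 1" appended to the formula above suppresses patsy's automatic intercept, which the ordered-logit model absorbs into its cutoff parameters instead. A quick sketch of the effect on toy data:

import pandas as pd
from patsy import dmatrices

df = pd.DataFrame({"choice": [0, 1, 2, 1], "age": [20, 30, 40, 50]})
y, x = dmatrices("choice ~ age - 1", df, return_type="dataframe")
print(x.columns.tolist())  # ['age'] -- no 'Intercept' column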
Example #6
Source File: dynlin.py    From pyflux with BSD 3-Clause "New" or "Revised" License
def __init__(self,formula,data):

        # Initialize TSM object
        super(DynReg,self).__init__('DynReg')

        # Latent variables
        self.max_lag = 0
        self._z_hide = 0 # Whether to cut off variance latent variables from results
        self.supported_methods = ["MLE","PML","Laplace","M-H","BBVI"]
        self.default_method = "MLE"
        self.model_name = "Dynamic Linear Regression"
        self.multivariate_model = False

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.z_no = self.X.shape[1] + 1
        self.y_name = self.y.design_info.describe()
        self.data_name = self.y_name
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = np.array([self.y]).ravel()
        self.data = self.y
        self.X = np.array([self.X])[0]
        self.index = data.index

        self._create_latent_variables() 
Example #7
Source File: egarchmreg.py    From pyflux with BSD 3-Clause "New" or "Revised" License
def __init__(self, data, p, q, formula):

        # Initialize TSM object
        super(EGARCHMReg,self).__init__('EGARCHMReg')

        # Latent variables
        self.p = p
        self.q = q
        self.max_lag = max(self.p,self.q)  
        self.z_no = self.p + self.q + 2
        self._z_hide = 0 # Whether to cut off variance latent variables from results
        self.supported_methods = ["MLE","PML","Laplace","M-H","BBVI"]
        self.default_method = "MLE"
        self.multivariate_model = False
        self.leverage = False
        self.model_name = "EGARCHMReg(" + str(self.p) + "," + str(self.q) + ")"

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.z_no += self.X.shape[1]*2
        self.y_name = self.y.design_info.describe()
        self.data_name = self.y_name
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = np.array([self.y]).ravel()
        self.data = self.y
        self.data_length = len(self.data)
        self.X = np.array([self.X])[0]
        self.index = data.index
        self.initial_values = np.zeros(self.z_no)

        self._create_latent_variables() 
Example #8
Source File: utils.py    From pymer4 with MIT License
def _chunk_boot_ols_coefs(dat, formula, weights, seed):
    """
    OLS computation of coefficients to be used in a parallelization context.
    """
    # Random sample with replacement from all data
    dat = dat.sample(frac=1, replace=True, random_state=seed)
    y, x = dmatrices(formula, dat, 1, return_type="dataframe")
    b = _ols(
        x, y, robust=None, n_lags=1, cluster=None, all_stats=False, weights=weights
    )
    return list(b) 
Example #9
Source File: utils.py    From pymer4 with MIT License
def _ols_group(dat, formula, group_col, group, rank):
    """Compute OLS on data given a formula. Used by Lm2"""
    dat = dat[dat[group_col] == group].reset_index(drop=True)
    if rank:
        dat = dat.rank()
    y, x = dmatrices(formula, dat, 1, return_type="dataframe")
    b = _ols(x, y, robust=None, n_lags=1, cluster=None, all_stats=False)
    return list(b) 
Example #10
Source File: smoothers.py    From plotnine with GNU General Public License v2.0
def gls_formula(data, xseq, **params):
    """
    Fit GLS using a formula
    """
    eval_env = params['enviroment']  # sic: the params dict uses this spelling
    formula = params['formula']
    init_kwargs, fit_kwargs = separate_method_kwargs(
        params['method_args'], sm.GLS, sm.GLS.fit)
    model = smf.gls(
        formula,
        data,
        eval_env=eval_env,
        **init_kwargs
    )
    results = model.fit(**fit_kwargs)
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(data)

    if params['se']:
        _, predictors = dmatrices(formula, data, eval_env=eval_env)
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(
            results, predictors, alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u
    return data 
Example #11
Source File: utils.py    From pymer4 with MIT License 5 votes vote down vote up
def _corr_group(dat, formula, group_col, group, rank, corr_type):
    """Compute partial correlations via OLS. Used by Lm2"""

    from scipy.stats import pearsonr

    dat = dat[dat[group_col] == group].reset_index(drop=True)
    if rank:
        dat = dat.rank()
    y, x = dmatrices(formula, dat, 1, return_type="dataframe")
    corrs = []
    for c in x.columns[1:]:
        other_preds = [e for e in x.columns if e != c]
        other_preds = x[other_preds]
        cc = x[c]
        pred_m_resid = _ols(
            other_preds,
            cc,
            robust=None,
            n_lags=1,
            cluster=None,
            all_stats=False,
            resid_only=True,
        )
        if corr_type == "semi":
            dv_m_resid = y.values.squeeze()
        elif corr_type == "partial":
            dv_m_resid = _ols(
                other_preds,
                y,
                robust=None,
                n_lags=1,
                cluster=None,
                all_stats=False,
                resid_only=True,
            )
        corrs.append(pearsonr(dv_m_resid, pred_m_resid)[0])
    return corrs 
Example #12
Source File: model_recommendation.py    From DIVE-backend with GNU General Public License v3.0
def f_regression(df, dependent_variable, independent_variables, interaction_terms=[], model_limit=5):
    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables, interaction_terms, table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    # this function shadows sklearn.feature_selection.f_regression, so the
    # univariate test is imported under an alias to avoid accidental recursion
    from sklearn.feature_selection import f_regression as sklearn_f_regression
    f_test, r = sklearn_f_regression(X, y, center=True)
    logger.info(f_test)
    logger.info(r)
    return 
Example #13
Source File: model_recommendation.py    From DIVE-backend with GNU General Public License v3.0
def recursive_feature_elimination(df, dependent_variable, independent_variables, interaction_terms=[], model_limit=5):
    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables, interaction_terms, table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    estimator = SVR(kernel='linear')
    selector = RFE(estimator, n_features_to_select=5, step=1)
    selector = selector.fit(X, y)
    logger.info(selector.support_)
    logger.info(selector.ranking_)
    return 
Example #14
Source File: pipelines.py    From DIVE-backend with GNU General Public License v3.0
def run_logistic_regression(df, patsy_model, dependent_variable, estimator, weights):
    y, X = dmatrices(patsy_model, df, return_type='dataframe')

    model_result = discrete_model.MNLogit(y, X).fit(maxiter=100, disp=False, method="nm")

    p_values = model_result.pvalues[0].to_dict()
    t_values = model_result.tvalues[0].to_dict()
    params = model_result.params[0].to_dict()
    ste = model_result.bse[0].to_dict()

    constants = {
        'p_value': p_values.get('Intercept'),
        't_value': t_values.get('Intercept'),
        'coefficient': params.get('Intercept'),
        'standard_error': ste.get('Intercept')
    }

    regression_field_properties = {
        'p_value': p_values,
        't_value': t_values,
        'coefficient': params,
        'standard_error': ste
    }

    total_regression_properties = {
        'aic': model_result.aic,
        'bic': model_result.bic,
        'r_squared': model_result.prsquared,
        'r_squared_adj': model_result.prsquared,  # MNLogit has no adjusted pseudo R-squared
        'llf': model_result.llf,
        'llnull': model_result.llnull,
        'llr_pvalue': model_result.llr_pvalue
        # 'f_test': model_result.f_test
    }

    regression_results = restructure_field_properties_dict(constants, regression_field_properties, total_regression_properties)

    return regression_results 
Example #15
Source File: pipelines.py    From DIVE-backend with GNU General Public License v3.0
def run_linear_regression(df, patsy_model, dependent_variable, estimator, weights):
    y, X = dmatrices(patsy_model, df, return_type='dataframe')
    model_result = sm.OLS(y, X).fit()

    p_values = model_result.pvalues.to_dict()
    t_values = model_result.tvalues.to_dict()
    params = model_result.params.to_dict()
    ste = model_result.bse.to_dict()
    conf_ints = parse_confidence_intervals(model_result)

    constants = {
        'p_value': p_values.get('Intercept'),
        't_value': t_values.get('Intercept'),
        'coefficient': params.get('Intercept'),
        'standard_error': ste.get('Intercept'),
        'conf_int': conf_ints.get('Intercept')
    }

    regression_field_properties = {
        'p_value': p_values,
        't_value': t_values,
        'coefficient': params,
        'standard_error': ste,
        'conf_int': conf_ints
    }

    total_regression_properties = {
        'aic': model_result.aic,
        'bic': model_result.bic,
        'dof': model_result.nobs,  # NOTE: the observation count, stored under 'dof'
        'r_squared': model_result.rsquared,
        'r_squared_adj': model_result.rsquared_adj,
        'f_test': model_result.fvalue,
        # 'resid': model_result.resid.tolist()
    }

    regression_results = restructure_field_properties_dict(constants, regression_field_properties, total_regression_properties)

    return regression_results 
Example #16
Source File: utilities.py    From DIVE-backend with GNU General Public License v3.0
def get_design_matrices(df, dependent_variable, independent_variables, interactions=[]):
    patsy_model = create_patsy_model(dependent_variable, independent_variables, interactions=interactions)
    y, X = dmatrices(patsy_model, df, return_type='dataframe')
    return (y, X) 
Example #17
Source File: test_multivariate_ols.py    From vnpy_crypto with MIT License
def test_from_formula_vs_no_formula():
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data)
    r = mod.fit(method='svd')
    r0 = r.mv_test()
    endog, exog = patsy.dmatrices(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data, return_type="dataframe")
    L = np.array([[1, 0, 0, 0, 0, 0]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Intercept', L, None]])
    assert_array_almost_equal(r1['Intercept']['stat'].values,
                              r0['Intercept']['stat'].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Intercept', L, None]])
    assert_array_almost_equal(r1['Intercept']['stat'].values,
                              r0['Intercept']['stat'].values, decimal=6)
    L = np.array([[0, 1, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0],
                  ])
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    assert_array_almost_equal(r1['Drug']['stat'].values,
                              r0['Drug']['stat'].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    assert_array_almost_equal(r1['Drug']['stat'].values,
                              r0['Drug']['stat'].values, decimal=6) 
Example #18
Source File: patsy_adaptor.py    From patsylearn with GNU General Public License v2.0
def score(self, data):
        """Predict with estimator using formula.

        Transform the data using formula, then predict on it
        using the estimator.

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Column names need to match variables in formula.
            Data needs to contain the label column.
        """
        design_infos = (self.design_y_, self.design_X_)
        design_y, design_X = dmatrices(design_infos, data)
        return self.estimator_.score(design_X, design_y) 
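Example #18 feeds saved DesignInfo objects back into dmatrices() so that test data is encoded with the codings learned at fit time. patsy's build_design_matrices() exposes the same stateful mechanism directly; a minimal sketch with toy data:

import pandas as pd
from patsy import dmatrices, build_design_matrices

train = pd.DataFrame({"y": [1.0, 2.0, 3.0], "g": ["a", "b", "a"]})
new = pd.DataFrame({"g": ["b", "a"]})

y, X = dmatrices("y ~ g", train)

# design_info remembers the factor coding, so new data is dummy-coded
# against the *training* categories -- the property score() relies on.
(X_new,) = build_design_matrices([X.design_info], new)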
Example #19
Source File: formulatools.py    From Splunking-Crime with GNU Affero General Public License v3.0
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and the
        value is a function that returns endog, exog, and a formula object.
    depth : int
        Stack depth passed to patsy as ``eval_env`` for variable lookup.
    missing : str
        Passed to ``NAAction`` as ``on_NA``; typically 'drop' or 'raise'.

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        # the pandas and plain-array paths build the matrices identically
        result = dmatrices(formula, (Y, X), depth,
                           return_type='dataframe', NA_action=na_action)
    else:
        result = dmatrices(formula, Y, depth, return_type='dataframe',
                           NA_action=na_action)

    # if missing == 'raise' there's no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info 
Example #20
Source File: Lm.py    From pymer4 with MIT License
def to_corrs(self, corr_type="semi", ztrans_corrs=False):
        """
        Transform fitted model coefficients (excluding the intercept) to partial or
        semi-partial correlations with the dependent variable. This is useful for
        rescaling coefficients to a correlation scale (-1 to 1) and does **not**
        change how inferences are performed.

        Semi-partial correlations are computed as the correlation between the DV and
        each predictor *after* the influence of all other predictors has been
        regressed out of that predictor. They are interpretable in the same way as
        the original coefficients.

        Partial correlations reflect the unique variance a predictor explains in the
        DV, accounting for correlations between predictors *and* what is not
        explained by other predictors; this value is always >= the semi-partial
        correlation. They are computed as the correlation between the DV and each
        predictor *after* the influence of all other predictors has been regressed
        out of both that predictor *and* the DV, and are *not* interpretable in the
        same way as the original coefficients. Good ref: https://bit.ly/2GNwXh5

        Args:
            corr_type (string): 'semi' or 'partial'
            ztrans_corrs (bool): whether to Fisher z-transform (arctanh) correlations before reporting them; default False

        Returns:
            pd.Series: partial or semi-partial correlations

        """

        if not self.fitted:
            raise RuntimeError(
                "Model must be fit before partial correlations can be computed"
            )
        if corr_type not in ["semi", "partial"]:
            raise ValueError("corr_type must be 'semi' or 'partial'")
        from scipy.stats import pearsonr

        corrs = []
        corrs.append(np.nan)  # don't compute for intercept
        for c in self.design_matrix.columns[1:]:
            dv = self.formula.split("~")[0]
            other_preds = [e for e in self.design_matrix.columns[1:] if e != c]
            right_side = "+".join(other_preds)
            y, x = dmatrices(
                c + "~" + right_side, self.data, 1, return_type="dataframe"
            )
            pred_m_resid = _ols(
                x,
                y,
                robust=False,
                n_lags=1,
                cluster=None,
                all_stats=False,
                resid_only=True,
            )
            y, x = dmatrices(
                dv + "~" + right_side, self.data, 1, return_type="dataframe"
            )
            if corr_type == "semi":
                dv_m_resid = y.values.squeeze()
            elif corr_type == "partial":
                dv_m_resid = _ols(
                    x,
                    y,
                    robust=False,
                    n_lags=1,
                    cluster=None,
                    all_stats=False,
                    resid_only=True,
                )
            corrs.append(pearsonr(dv_m_resid, pred_m_resid)[0])
        if ztrans_corrs:
            corrs = np.arctanh(corrs)
        return pd.Series(corrs, index=self.coefs.index) 
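For intuition, here is a self-contained numeric sketch of the semi-partial computation the docstring describes, with plain numpy least squares standing in for pymer4's internal _ols helper and random toy data:

import numpy as np
import pandas as pd
from patsy import dmatrices
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=["y", "a", "b"])

def resid(X, v):
    # Residual of v after regressing it on X.
    beta, *_ = np.linalg.lstsq(X, v, rcond=None)
    return v - X @ beta

# Semi-partial r for `a`: residualize `a` on the remaining predictors
# (intercept and b), then correlate the residual with the raw DV.
y, X = dmatrices("y ~ a + b", df, return_type="dataframe")
a_resid = resid(X[["Intercept", "b"]].to_numpy(), X["a"].to_numpy())
r_semi, _ = pearsonr(df["y"].to_numpy(), a_resid)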
Example #21
Source File: nnarx.py    From pyflux with BSD 3-Clause "New" or "Revised" License
def __init__(self, data, formula, ar, units, layers, family=fam.Normal()):

        # Initialize TSM object
        super(NNARX, self).__init__('NNARX')

        # Latent Variable information
        self.ar = ar
        self.units = units
        self.layers = layers
        self.activation = np.tanh
        self.model_name = "NNARX(" + str(self.ar) + ")"
        self.z_no = self.ar + 2
        self.max_lag = self.ar
        self._z_hide = 0 # Whether to cut off latent variables from results table
        self.supported_methods = ["BBVI"]
        self.default_method = "BBVI"
        self.multivariate_model = False

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data.copy()
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.y_name = self.y.design_info.describe()
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = self.y.astype(float)  # np.float was removed in NumPy 1.24
        self.X = self.X.astype(float)
        self.z_no = self.X.shape[1]
        self.data_name = self.y_name
        self.y = np.array([self.y]).ravel()
        self.data = self.y.copy()
        self.X = np.array([self.X])[0]
        self.index = data.index
        self.data_length = self.data.shape[0]
        self.X = self.X[self.ar:, :]
        self.X = np.concatenate([self._ar_matrix().T, self.X], axis=1).T

        self._create_latent_variables()

        self.family = family
        
        self.model_name2, self.link, self.scale, self.shape, self.skewness, self.mean_transform, self.cythonized = self.family.setup()
        
        self.model_name = self.model_name2 + " NNARX(" + str(self.ar) + ")"

        # Build any remaining latent variables that are specific to the family chosen
        for no, i in enumerate(self.family.build_latent_variables()):
            self.latent_variables.add_z(i[0], i[1], i[2])
            self.latent_variables.z_list[-1].start = i[3]

        self.z_no = len(self.latent_variables.z_list)
        self.family_z_no = len(self.family.build_latent_variables())

        # Initialize with random weights
        for var_no in range(len(self.latent_variables.z_list)-self.family_z_no):
            self.latent_variables.z_list[var_no].start = np.random.normal()

        if isinstance(self.family, fam.Normal):
            self.neg_loglik = self.normal_neg_loglik
        else:
            self.neg_loglik = self.general_neg_loglik 
Example #22
Source File: gasreg.py    From pyflux with BSD 3-Clause "New" or "Revised" License
def __init__(self, formula, data, family):

        # Initialize TSM object     
        super(GASReg,self).__init__('GASReg')

        # Latent Variables
        self.max_lag = 0
        self._z_hide = 0 # Whether to cut off variance latent variables from results
        self.supported_methods = ["MLE","PML","Laplace","M-H","BBVI"]
        self.default_method = "MLE"
        self.multivariate_model = False
        self.skewness = False

        # Format the data
        self.is_pandas = True # This is compulsory for this model type
        self.data_original = data
        self.formula = formula
        self.y, self.X = dmatrices(formula, data)
        self.y_name = self.y.design_info.describe()
        self.X_names = self.X.design_info.describe().split(" + ")
        self.y = self.y.astype(float)  # np.float was removed in NumPy 1.24
        self.X = self.X.astype(float)
        self.z_no = self.X.shape[1]
        self.data_name = self.y_name
        self.y = np.array([self.y]).ravel()
        self.data = self.y
        self.X = np.array([self.X])[0]
        self.index = data.index
        self.initial_values = np.zeros(self.z_no)

        self.data_length = self.data.shape[0]
        self._create_model_matrices()
        self._create_latent_variables()

        self.family = family
        
        self.model_name2, self.link, self.scale, self.shape, self.skewness, self.mean_transform, self.cythonized = self.family.setup()
    
        # Identify whether model has cythonized backend - then choose update type
        if self.cythonized is True:
            self._model = self._cythonized_model 
            self._mb_model = self._cythonized_mb_model
            self.recursion = self.family.gradientreg_recursion()
        else:
            self._model = self._uncythonized_model
            self._mb_model = self._uncythonized_mb_model

        self.model_name = self.model_name2 + " GAS Regression"

        # Build any remaining latent variables that are specific to the family chosen
        for no, i in enumerate(self.family.build_latent_variables()):
            self.latent_variables.add_z(i[0],i[1],i[2])
            self.latent_variables.z_list[no+self.z_no].start = i[3]

        self.family_z_no = len(self.family.build_latent_variables())
        self.z_no += len(self.family.build_latent_variables()) 
Example #23
Source File: mice.py    From Splunking-Crime with GNU Affero General Public License v3.0
def get_fitting_data(self, vname):
        """
        Return the data needed to fit a model for imputation.

        The data is used to impute variable `vname`, and therefore
        only includes cases for which `vname` is observed.

        Values of type `PatsyFormula` in `init_kwds` or `fit_kwds` are
        processed through Patsy and subset to align with the model's
        endog and exog.

        Parameters
        ----------
        vname : string
           The variable for which the fitting data is returned.

        Returns
        -------
        endog : DataFrame
            Observed values of `vname`.
        exog : DataFrame
            Regression design matrix for imputing `vname`.
        init_kwds : dict-like
            The init keyword arguments for `vname`, processed through Patsy
            as required.
        fit_kwds : dict-like
            The fit keyword arguments for `vname`, processed through Patsy
            as required.
        """

        # Rows with observed endog
        ix = self.ix_obs[vname]

        formula = self.conditional_formula[vname]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")

        endog = np.asarray(endog.iloc[ix, 0])
        exog = np.asarray(exog.iloc[ix, :])

        init_kwds = self._process_kwds(self.init_kwds[vname], ix)
        fit_kwds = self._process_kwds(self.fit_kwds[vname], ix)

        return endog, exog, init_kwds, fit_kwds 
Example #24
Source File: mice.py    From Splunking-Crime with GNU Affero General Public License v3.0
def get_split_data(self, vname):
        """
        Return endog and exog for imputation of a given variable.

        Parameters
        ----------
        vname : string
           The variable for which the split data is returned.

        Returns
        -------
        endog_obs : DataFrame
            Observed values of the variable to be imputed.
        exog_obs : DataFrame
            Current values of the predictors where the variable to be
            imputed is observed.
        exog_miss : DataFrame
            Current values of the predictors where the variable to be
            imputed is missing.
        init_kwds : dict-like
            The init keyword arguments for `vname`, processed through Patsy
            as required.
        fit_kwds : dict-like
            The fit keyword arguments for `vname`, processed through Patsy
            as required.
        """

        formula = self.conditional_formula[vname]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")

        # Rows with observed endog
        ixo = self.ix_obs[vname]
        endog_obs = np.asarray(endog.iloc[ixo])
        exog_obs = np.asarray(exog.iloc[ixo, :])

        # Rows with missing endog
        ixm = self.ix_miss[vname]
        exog_miss = np.asarray(exog.iloc[ixm, :])

        predict_obs_kwds = {}
        if vname in self.predict_kwds:
            kwds = self.predict_kwds[vname]
            predict_obs_kwds = self._process_kwds(kwds, ixo)

        predict_miss_kwds = {}
        if vname in self.predict_kwds:
            kwds = self.predict_kwds[vname]
            predict_miss_kwds = self._process_kwds(kwds, ixm)  # subset kwds to the *missing* rows

        return endog_obs, exog_obs, exog_miss, predict_obs_kwds, predict_miss_kwds 
Example #25
Source File: smoothers.py    From plotnine with GNU General Public License v2.0
def lm_formula(data, xseq, **params):
    """
    Fit OLS / WLS using a formula
    """
    formula = params['formula']
    eval_env = params['enviroment']  # sic: the params dict uses this spelling
    weights = data.get('weight', None)

    if weights is None:
        init_kwargs, fit_kwargs = separate_method_kwargs(
            params['method_args'], sm.OLS, sm.OLS.fit)
        model = smf.ols(
            formula,
            data,
            eval_env=eval_env,
            **init_kwargs
        )
    else:
        if np.any(weights < 0):
            raise ValueError(
                "All weights must be non-negative."
            )
        init_kwargs, fit_kwargs = separate_method_kwargs(
            params['method_args'], sm.OLS, sm.OLS.fit)
        model = smf.wls(
            formula,
            data,
            weights=weights,
            eval_env=eval_env,
            **init_kwargs
        )

    results = model.fit(**fit_kwargs)
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(data)

    if params['se']:
        _, predictors = dmatrices(formula, data, eval_env=eval_env)
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(
            results, predictors, alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u
    return data 
Example #26
Source File: mice.py    From vnpy_crypto with MIT License
def get_fitting_data(self, vname):
        """
        Return the data needed to fit a model for imputation.

        The data is used to impute variable `vname`, and therefore
        only includes cases for which `vname` is observed.

        Values of type `PatsyFormula` in `init_kwds` or `fit_kwds` are
        processed through Patsy and subset to align with the model's
        endog and exog.

        Parameters
        ----------
        vname : string
           The variable for which the fitting data is returned.

        Returns
        -------
        endog : DataFrame
            Observed values of `vname`.
        exog : DataFrame
            Regression design matrix for imputing `vname`.
        init_kwds : dict-like
            The init keyword arguments for `vname`, processed through Patsy
            as required.
        fit_kwds : dict-like
            The fit keyword arguments for `vname`, processed through Patsy
            as required.
        """

        # Rows with observed endog
        ix = self.ix_obs[vname]

        formula = self.conditional_formula[vname]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")

        endog = np.asarray(endog.iloc[ix, 0])
        exog = np.asarray(exog.iloc[ix, :])

        init_kwds = self._process_kwds(self.init_kwds[vname], ix)
        fit_kwds = self._process_kwds(self.fit_kwds[vname], ix)

        return endog, exog, init_kwds, fit_kwds 
Example #27
Source File: mice.py    From vnpy_crypto with MIT License
def get_split_data(self, vname):
        """
        Return endog and exog for imputation of a given variable.

        Parameters
        ----------
        vname : string
           The variable for which the split data is returned.

        Returns
        -------
        endog_obs : DataFrame
            Observed values of the variable to be imputed.
        exog_obs : DataFrame
            Current values of the predictors where the variable to be
            imputed is observed.
        exog_miss : DataFrame
            Current values of the predictors where the variable to be
            imputed is missing.
        init_kwds : dict-like
            The init keyword arguments for `vname`, processed through Patsy
            as required.
        fit_kwds : dict-like
            The fit keyword arguments for `vname`, processed through Patsy
            as required.
        """

        formula = self.conditional_formula[vname]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")

        # Rows with observed endog
        ixo = self.ix_obs[vname]
        endog_obs = np.asarray(endog.iloc[ixo])
        exog_obs = np.asarray(exog.iloc[ixo, :])

        # Rows with missing endog
        ixm = self.ix_miss[vname]
        exog_miss = np.asarray(exog.iloc[ixm, :])

        predict_obs_kwds = {}
        if vname in self.predict_kwds:
            kwds = self.predict_kwds[vname]
            predict_obs_kwds = self._process_kwds(kwds, ixo)

        predict_miss_kwds = {}
        if vname in self.predict_kwds:
            kwds = self.predict_kwds[vname]
            predict_miss_kwds = self._process_kwds(kwds, ixm)  # subset kwds to the *missing* rows

        return (endog_obs, exog_obs, exog_miss, predict_obs_kwds,
                predict_miss_kwds) 
Example #28
Source File: formulatools.py    From vnpy_crypto with MIT License
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and the
        value is a function that returns endog, exog, and a formula object.
    depth : int
        Stack depth passed to patsy as ``eval_env`` for variable lookup.
    missing : str
        Passed to ``NAAction`` as ``on_NA``; typically 'drop' or 'raise'.

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        # the pandas and plain-array paths build the matrices identically
        result = dmatrices(formula, (Y, X), depth,
                           return_type='dataframe', NA_action=na_action)
    else:
        result = dmatrices(formula, Y, depth, return_type='dataframe',
                           NA_action=na_action)

    # if missing == 'raise' there's no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info
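Finally, the missing= argument above maps onto patsy's NA handling (statsmodels subclasses NAAction so it can additionally record which rows were dropped as missing_mask). The patsy side, sketched with toy data:

import numpy as np
import pandas as pd
from patsy import dmatrices, PatsyError

df = pd.DataFrame({"y": [1.0, np.nan, 3.0], "x": [0.5, 1.5, 2.5]})

# NA_action="drop" (the default) removes incomplete rows...
y, X = dmatrices("y ~ x", df, NA_action="drop")
print(y.shape)  # (2, 1): the NaN row is gone

# ...while NA_action="raise" turns them into an error instead.
try:
    dmatrices("y ~ x", df, NA_action="raise")
except PatsyError as err:
    print(err)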