Python patsy.dmatrices() Examples
The following are 28 code examples of patsy.dmatrices(), drawn from open source projects. The source file, project, and license are listed above each example.
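As a quick orientation before the examples, here is a minimal sketch of the basic call. The DataFrame and its column names (y, x1, x2) are hypothetical; the point is only that dmatrices() takes a formula string plus a data container and returns the left-hand-side and right-hand-side design matrices.

# Minimal usage sketch (hypothetical column names y, x1, x2).
import numpy as np
import pandas as pd
from patsy import dmatrices

df = pd.DataFrame({
    "y": np.random.normal(size=10),
    "x1": np.random.normal(size=10),
    "x2": np.random.normal(size=10),
})

# Returns (lhs, rhs) design matrices; an intercept column is added
# to the right-hand side by default.
y, X = dmatrices("y ~ x1 + x2", df, return_type="dataframe")
print(X.columns.tolist())  # ['Intercept', 'x1', 'x2']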
Example #1
Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0

def fit(self, data, y=None):
    """Fit the scikit-learn model using the formula.

    Parameters
    ----------
    data : dict-like (pandas dataframe)
        Input data. Contains features and possible labels.
        Column names need to match variables in formula.
    """
    eval_env = EvalEnvironment.capture(self.eval_env, reference=1)
    formula = _drop_intercept(self.formula, self.add_intercept)

    design_y, design_X = dmatrices(formula, data, eval_env=eval_env,
                                   NA_action=self.NA_action)
    self.design_y_ = design_y.design_info
    self.design_X_ = design_X.design_info
    self.feature_names_ = design_X.design_info.column_names
    # convert to 1d vector so we don't get a warning
    # from sklearn.
    design_y = column_or_1d(design_y)
    est = clone(self.estimator)
    self.estimator_ = est.fit(design_X, design_y)
    return self
Example #2
Source File: patsy_wraps.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

def transform_with_patsy(formula, data, *args, **kwargs):
    try:
        # needs patsy v0.5.1 to support formula in Python 3.7
        # https://github.com/pydata/patsy/pull/131
        import patsy
    except ImportError:
        raise ImportError("'patsy' is required to transform with string formula")

    if '~' in formula:
        y, X = patsy.dmatrices(formula, data=data,
                               return_type='dataframe', *args, **kwargs)
        if len(y.shape) > 1 and y.shape[1] != 1:
            raise ValueError('target must be 1 dimensional')
        y = y.iloc[:, 0]
        return data._constructor(X, target=y)
    else:
        X = patsy.dmatrix(formula, data=data,
                          return_type='dataframe', *args, **kwargs)
        return data._constructor(X)
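The dispatch in this example is worth spelling out: patsy treats a string containing '~' as a two-sided formula, so dmatrices() returns both design matrices, while a right-hand-side-only description goes through dmatrix(). A hedged sketch of that distinction, with hypothetical column names a and b:

# Sketch of the '~' dispatch above (hypothetical columns a, b).
import pandas as pd
import patsy

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})

# '~' present: two-sided formula, dmatrices() returns (lhs, rhs).
y, X = patsy.dmatrices("a ~ b", df, return_type="dataframe")

# No '~': right-hand side only, handled by dmatrix().
X_only = patsy.dmatrix("b", df, return_type="dataframe")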
Example #3
Source File: data.py From vnpy_crypto with MIT License

def __setstate__(self, d):
    if "restore_design_info" in d:
        # NOTE: there may be a more performant way to do this
        from patsy import dmatrices, PatsyError
        exc = []
        try:
            data = d['frame']
        except KeyError:
            data = d['orig_endog'].join(d['orig_exog'])
        for depth in [2, 3, 1, 0, 4]:  # sequence is a guess where to likely find it
            try:
                _, design = dmatrices(d['formula'], data, eval_env=depth,
                                      return_type='dataframe')
                break
            except (NameError, PatsyError) as e:
                print('not in depth %d' % depth)
                exc.append(e)  # why do I need a reference from outside except block
                pass
        else:
            raise exc[-1]

        self.design_info = design.design_info
        del d["restore_design_info"]
    self.__dict__.update(d)
Example #4
Source File: data.py From Splunking-Crime with GNU Affero General Public License v3.0

def __setstate__(self, d):
    if "restore_design_info" in d:
        # NOTE: there may be a more performant way to do this
        from patsy import dmatrices, PatsyError
        exc = []
        try:
            data = d['frame']
        except KeyError:
            data = d['orig_endog'].join(d['orig_exog'])
        for depth in [2, 3, 1, 0, 4]:  # sequence is a guess where to likely find it
            try:
                _, design = dmatrices(d['formula'], data, eval_env=depth,
                                      return_type='dataframe')
                break
            except (NameError, PatsyError) as e:
                print('not in depth %d' % depth)
                exc.append(e)  # why do I need a reference from outside except block
                pass
        else:
            raise exc[-1]

        self.design_info = design.design_info
        del d["restore_design_info"]
    self.__dict__.update(d)
Example #5
Source File: ordered_logit.py From estimagic with BSD 3-Clause "New" or "Revised" License

def ordered_logit_processing(formula, data):
    """Process user input for an ordered logit model."""
    # extract data arrays
    y, x = dmatrices(formula + " - 1", data, return_type="dataframe")
    y = y[y.columns[0]]

    # extract dimensions
    num_choices = len(y.unique())
    beta_names = list(x.columns)
    num_betas = len(beta_names)
    num_cutoffs = num_choices - 1

    # set-up index for params_df
    names = beta_names + list(range(num_cutoffs))
    categories = ["beta"] * num_betas + ["cutoff"] * num_cutoffs
    index = pd.MultiIndex.from_tuples(zip(categories, names), names=["type", "name"])

    # make params_df
    np.random.seed(5471)
    start_params = pd.DataFrame(index=index)
    start_params["value"] = np.hstack(
        [
            np.random.uniform(low=-0.5, high=0.5, size=len(x.columns)),
            np.arange(num_cutoffs) * 2,
        ]
    )

    # make constraints
    constr = [{"loc": "cutoff", "type": "increasing"}]

    return start_params, y.to_numpy().astype(int), x.to_numpy(), constr
Example #6
Source File: dynlin.py From pyflux with BSD 3-Clause "New" or "Revised" License

def __init__(self, formula, data):

    # Initialize TSM object
    super(DynReg, self).__init__('DynReg')

    # Latent variables
    self.max_lag = 0
    self._z_hide = 0  # Whether to cutoff variance latent variables from results
    self.supported_methods = ["MLE", "PML", "Laplace", "M-H", "BBVI"]
    self.default_method = "MLE"
    self.model_name = "Dynamic Linear Regression"
    self.multivariate_model = False

    # Format the data
    self.is_pandas = True  # This is compulsory for this model type
    self.data_original = data
    self.formula = formula
    self.y, self.X = dmatrices(formula, data)
    self.z_no = self.X.shape[1] + 1
    self.y_name = self.y.design_info.describe()
    self.data_name = self.y_name
    self.X_names = self.X.design_info.describe().split(" + ")
    self.y = np.array([self.y]).ravel()
    self.data = self.y
    self.X = np.array([self.X])[0]
    self.index = data.index

    self._create_latent_variables()
Example #7
Source File: egarchmreg.py From pyflux with BSD 3-Clause "New" or "Revised" License

def __init__(self, data, p, q, formula):

    # Initialize TSM object
    super(EGARCHMReg, self).__init__('EGARCHMReg')

    # Latent variables
    self.p = p
    self.q = q
    self.max_lag = max(self.p, self.q)
    self.z_no = self.p + self.q + 2
    self._z_hide = 0  # Whether to cutoff variance latent variables from results
    self.supported_methods = ["MLE", "PML", "Laplace", "M-H", "BBVI"]
    self.default_method = "MLE"
    self.multivariate_model = False
    self.leverage = False
    self.model_name = "EGARCHMReg(" + str(self.p) + "," + str(self.q) + ")"

    # Format the data
    self.is_pandas = True  # This is compulsory for this model type
    self.data_original = data
    self.formula = formula
    self.y, self.X = dmatrices(formula, data)
    self.z_no += self.X.shape[1]*2
    self.y_name = self.y.design_info.describe()
    self.data_name = self.y_name
    self.X_names = self.X.design_info.describe().split(" + ")
    self.y = np.array([self.y]).ravel()
    self.data = self.y
    self.data_length = len(self.data)
    self.X = np.array([self.X])[0]
    self.index = data.index

    self.initial_values = np.zeros(self.z_no)
    self._create_latent_variables()
Example #8
Source File: utils.py From pymer4 with MIT License

def _chunk_boot_ols_coefs(dat, formula, weights, seed):
    """
    OLS computation of coefficients to be used in a parallelization context.
    """
    # Random sample with replacement from all data
    dat = dat.sample(frac=1, replace=True, random_state=seed)
    y, x = dmatrices(formula, dat, 1, return_type="dataframe")
    b = _ols(
        x, y, robust=None, n_lags=1, cluster=None, all_stats=False, weights=weights
    )
    return list(b)
Example #9
Source File: utils.py From pymer4 with MIT License

def _ols_group(dat, formula, group_col, group, rank):
    """Compute OLS on data given a formula. Used by Lm2"""
    dat = dat[dat[group_col] == group].reset_index(drop=True)
    if rank:
        dat = dat.rank()
    y, x = dmatrices(formula, dat, 1, return_type="dataframe")
    b = _ols(x, y, robust=None, n_lags=1, cluster=None, all_stats=False)
    return list(b)
Example #10
Source File: smoothers.py From plotnine with GNU General Public License v2.0

def gls_formula(data, xseq, **params):
    """
    Fit GLS using a formula
    """
    # note: 'enviroment' is the key's actual spelling in plotnine's params
    eval_env = params['enviroment']
    formula = params['formula']
    init_kwargs, fit_kwargs = separate_method_kwargs(
        params['method_args'], sm.GLS, sm.GLS.fit)
    model = smf.gls(
        formula,
        data,
        eval_env=eval_env,
        **init_kwargs
    )
    results = model.fit(**fit_kwargs)
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(data)

    if params['se']:
        _, predictors = dmatrices(formula, data, eval_env=eval_env)
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(
            results, predictors, alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u

    return data
Example #11
Source File: utils.py From pymer4 with MIT License

def _corr_group(dat, formula, group_col, group, rank, corr_type):
    """Compute partial correlations via OLS. Used by Lm2"""
    from scipy.stats import pearsonr

    dat = dat[dat[group_col] == group].reset_index(drop=True)
    if rank:
        dat = dat.rank()
    y, x = dmatrices(formula, dat, 1, return_type="dataframe")
    corrs = []
    for c in x.columns[1:]:
        other_preds = [e for e in x.columns if e != c]
        other_preds = x[other_preds]
        cc = x[c]
        pred_m_resid = _ols(
            other_preds,
            cc,
            robust=None,
            n_lags=1,
            cluster=None,
            all_stats=False,
            resid_only=True,
        )
        if corr_type == "semi":
            dv_m_resid = y.values.squeeze()
        elif corr_type == "partial":
            dv_m_resid = _ols(
                other_preds,
                y,
                robust=None,
                n_lags=1,
                cluster=None,
                all_stats=False,
                resid_only=True,
            )
        corrs.append(pearsonr(dv_m_resid, pred_m_resid)[0])
    return corrs
Example #12
Source File: model_recommendation.py From DIVE-backend with GNU General Public License v3.0

def f_regression(df, dependent_variable, independent_variables, interaction_terms=[], model_limit=5):
    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables,
                         interaction_terms, table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    # NOTE: this call shadows the enclosing function's name; presumably
    # sklearn.feature_selection.f_regression is the intended target.
    f_test, r = f_regression(X, y, center=True)
    logger.info(f_test)
    logger.info(r)
    return
Example #13
Source File: model_recommendation.py From DIVE-backend with GNU General Public License v3.0

def recursive_feature_elimination(df, dependent_variable, independent_variables, interaction_terms=[], model_limit=5):
    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables,
                         interaction_terms, table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    estimator = SVR(kernel='linear')
    selector = RFE(estimator, 5, step=1)
    selector = selector.fit(X, y)

    logger.info(selector.support_)
    logger.info(selector.ranking_)
    return
Example #14
Source File: pipelines.py From DIVE-backend with GNU General Public License v3.0

def run_logistic_regression(df, patsy_model, dependent_variable, estimator, weights):
    y, X = dmatrices(patsy_model, df, return_type='dataframe')

    model_result = discrete_model.MNLogit(y, X).fit(maxiter=100, disp=False, method="nm")

    p_values = model_result.pvalues[0].to_dict()
    t_values = model_result.tvalues[0].to_dict()
    params = model_result.params[0].to_dict()
    ste = model_result.bse[0].to_dict()

    constants = {
        'p_value': p_values.get('Intercept'),
        't_value': t_values.get('Intercept'),
        'coefficient': params.get('Intercept'),
        'standard_error': ste.get('Intercept')
    }

    regression_field_properties = {
        'p_value': p_values,
        't_value': t_values,
        'coefficient': params,
        'standard_error': ste
    }

    total_regression_properties = {
        'aic': model_result.aic,
        'bic': model_result.bic,
        'r_squared': model_result.prsquared,
        'r_squared_adj': model_result.prsquared,
        'llf': model_result.llf,
        'llnull': model_result.llnull,
        'llr_pvalue': model_result.llr_pvalue
        # 'f_test': model_result.f_test
    }

    regression_results = restructure_field_properties_dict(
        constants, regression_field_properties, total_regression_properties)

    return regression_results
Example #15
Source File: pipelines.py From DIVE-backend with GNU General Public License v3.0

def run_linear_regression(df, patsy_model, dependent_variable, estimator, weights):
    y, X = dmatrices(patsy_model, df, return_type='dataframe')

    model_result = sm.OLS(y, X).fit()

    p_values = model_result.pvalues.to_dict()
    t_values = model_result.tvalues.to_dict()
    params = model_result.params.to_dict()
    ste = model_result.bse.to_dict()
    conf_ints = parse_confidence_intervals(model_result)

    constants = {
        'p_value': p_values.get('Intercept'),
        't_value': t_values.get('Intercept'),
        'coefficient': params.get('Intercept'),
        'standard_error': ste.get('Intercept'),
        'conf_int': conf_ints.get('Intercept')
    }

    regression_field_properties = {
        'p_value': p_values,
        't_value': t_values,
        'coefficient': params,
        'standard_error': ste,
        'conf_int': conf_ints
    }

    total_regression_properties = {
        'aic': model_result.aic,
        'bic': model_result.bic,
        'dof': model_result.nobs,
        'r_squared': model_result.rsquared,
        'r_squared_adj': model_result.rsquared_adj,
        'f_test': model_result.fvalue,
        # 'resid': model_result.resid.tolist()
    }

    regression_results = restructure_field_properties_dict(
        constants, regression_field_properties, total_regression_properties)

    return regression_results
Example #16
Source File: utilities.py From DIVE-backend with GNU General Public License v3.0

def get_design_matrices(df, dependent_variable, independent_variables, interactions=[]):
    patsy_model = create_patsy_model(dependent_variable, independent_variables,
                                     interactions=interactions)
    y, X = dmatrices(patsy_model, df, return_type='dataframe')
    return (y, X)
Example #17
Source File: test_multivariate_ols.py From vnpy_crypto with MIT License

def test_from_formula_vs_no_formula():
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data)
    r = mod.fit(method='svd')
    r0 = r.mv_test()
    endog, exog = patsy.dmatrices(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data, return_type="dataframe")
    L = np.array([[1, 0, 0, 0, 0, 0]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Intercept', L, None]])
    assert_array_almost_equal(r1['Intercept']['stat'].values,
                              r0['Intercept']['stat'].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Intercept', L, None]])
    assert_array_almost_equal(r1['Intercept']['stat'].values,
                              r0['Intercept']['stat'].values, decimal=6)
    L = np.array([[0, 1, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0],
                  ])
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    assert_array_almost_equal(r1['Drug']['stat'].values,
                              r0['Drug']['stat'].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    assert_array_almost_equal(r1['Drug']['stat'].values,
                              r0['Drug']['stat'].values, decimal=6)
Example #18
Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0

def score(self, data):
    """Predict with estimator using formula.

    Transform the data using formula, then predict on it
    using the estimator.

    Parameters
    ----------
    data : dict-like (pandas dataframe)
        Input data. Column names need to match variables
        in formula. Data needs to contain the label column.
    """
    design_infos = (self.design_y_, self.design_X_)
    design_y, design_X = dmatrices(design_infos, data)
    return self.estimator_.score(design_X, design_y)
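The detail to notice above is that dmatrices() is given the DesignInfo objects saved during fit() rather than the original formula string, so new data is encoded with exactly the factor levels and transforms learned at fit time. A hedged sketch of the same idea using patsy's build_design_matrices, with hypothetical DataFrames train and test:

# Sketch of reusing fit-time encoding on new data
# (hypothetical DataFrames train/test with columns y, x).
import pandas as pd
from patsy import dmatrices, build_design_matrices

train = pd.DataFrame({"y": [0.0, 1.0, 2.0], "x": ["a", "b", "a"]})
test = pd.DataFrame({"x": ["b", "a"]})

y, X = dmatrices("y ~ x", train, return_type="dataframe")

# Re-encode new data with the categorical coding captured at fit time.
(X_new,) = build_design_matrices([X.design_info], test,
                                 return_type="dataframe")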
Example #19
Source File: formulatools.py From Splunking-Crime with GNU Affero General Public License v3.0

def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by import formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        if data_util._is_using_pandas(Y, X):
            result = dmatrices(formula, (Y, X), depth,
                               return_type='dataframe', NA_action=na_action)
        else:
            result = dmatrices(formula, (Y, X), depth,
                               return_type='dataframe', NA_action=na_action)
    else:
        if data_util._is_using_pandas(Y, None):
            result = dmatrices(formula, Y, depth, return_type='dataframe',
                               NA_action=na_action)
        else:
            result = dmatrices(formula, Y, depth, return_type='dataframe',
                               NA_action=na_action)

    # if missing == 'raise' there's not missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info
Example #20
Source File: Lm.py From pymer4 with MIT License

def to_corrs(self, corr_type="semi", ztrans_corrs=False):
    """
    Transform fitted model coefficients (excluding the intercept) to partial
    or semi-partial correlations with the dependent variable. This is useful
    for rescaling coefficients to a correlation scale (-1 to 1) and does
    **not** change how inferences are performed. Semi-partial correlations are
    computed as the correlation between a DV and each predictor *after* the
    influence of all other predictors has been regressed out from that
    predictor. They are interpretable in the same way as the original
    coefficients. Partial correlations reflect the unique variance a predictor
    explains in the DV accounting for correlations between predictors *and*
    what is not explained by other predictors; this value is always >= the
    semi-partial correlation. They are *not* interpretable in the same way as
    the original coefficients. Partial correlations are computed as the
    correlations between a DV and each predictor *after* the influence of all
    other predictors has been regressed out from that predictor *and* the DV.
    Good ref: https://bit.ly/2GNwXh5

    Args:
        corr_type (string): 'semi' or 'partial'
        ztrans_corrs (bool): whether to fisher z-transform (arctanh)
            correlations before reporting them; default False

    Returns:
        pd.Series: partial or semi-partial correlations
    """

    if not self.fitted:
        raise RuntimeError(
            "Model must be fit before partial correlations can be computed"
        )
    if corr_type not in ["semi", "partial"]:
        raise ValueError("corr_type must be 'semi' or 'partial'")
    from scipy.stats import pearsonr

    corrs = []
    corrs.append(np.nan)  # don't compute for intercept
    for c in self.design_matrix.columns[1:]:
        dv = self.formula.split("~")[0]
        other_preds = [e for e in self.design_matrix.columns[1:] if e != c]
        right_side = "+".join(other_preds)
        y, x = dmatrices(
            c + "~" + right_side, self.data, 1, return_type="dataframe"
        )
        pred_m_resid = _ols(
            x,
            y,
            robust=False,
            n_lags=1,
            cluster=None,
            all_stats=False,
            resid_only=True,
        )
        y, x = dmatrices(
            dv + "~" + right_side, self.data, 1, return_type="dataframe"
        )
        if corr_type == "semi":
            dv_m_resid = y.values.squeeze()
        elif corr_type == "partial":
            dv_m_resid = _ols(
                x,
                y,
                robust=False,
                n_lags=1,
                cluster=None,
                all_stats=False,
                resid_only=True,
            )
        corrs.append(pearsonr(dv_m_resid, pred_m_resid)[0])
    if ztrans_corrs:
        corrs = np.arctanh(corrs)
    return pd.Series(corrs, index=self.coefs.index)
Example #21
Source File: nnarx.py From pyflux with BSD 3-Clause "New" or "Revised" License

def __init__(self, data, formula, ar, units, layers, family=fam.Normal()):

    # Initialize TSM object
    super(NNARX, self).__init__('NNARX')

    # Latent Variable information
    self.ar = ar
    self.units = units
    self.layers = layers
    self.activation = np.tanh
    self.model_name = "NNARX(" + str(self.ar) + ")"
    self.z_no = self.ar + 2
    self.max_lag = self.ar
    self._z_hide = 0  # Whether to cutoff latent variables from results table
    self.supported_methods = ["BBVI"]
    self.default_method = "BBVI"
    self.multivariate_model = False

    # Format the data
    self.is_pandas = True  # This is compulsory for this model type
    self.data_original = data.copy()
    self.formula = formula
    self.y, self.X = dmatrices(formula, data)
    self.y_name = self.y.design_info.describe()
    self.X_names = self.X.design_info.describe().split(" + ")
    self.y = self.y.astype(np.float)
    self.X = self.X.astype(np.float)
    self.z_no = self.X.shape[1]
    self.data_name = self.y_name
    self.y = np.array([self.y]).ravel()
    self.data = self.y.copy()
    self.X = np.array([self.X])[0]
    self.index = data.index
    self.data_length = self.data.shape[0]

    self.X = self.X[self.ar:, :]
    self.X = np.concatenate([self._ar_matrix().T, self.X], axis=1).T

    self._create_latent_variables()

    self.family = family
    self.model_name2, self.link, self.scale, self.shape, self.skewness, self.mean_transform, self.cythonized = self.family.setup()
    self.model_name = self.model_name2 + " NNARX(" + str(self.ar) + ")"

    # Build any remaining latent variables that are specific to the family chosen
    for no, i in enumerate(self.family.build_latent_variables()):
        self.latent_variables.add_z(i[0], i[1], i[2])
        self.latent_variables.z_list[-1].start = i[3]

    self.z_no = len(self.latent_variables.z_list)
    self.family_z_no = len(self.family.build_latent_variables())

    # Initialize with random weights
    for var_no in range(len(self.latent_variables.z_list)-self.family_z_no):
        self.latent_variables.z_list[var_no].start = np.random.normal()

    if isinstance(self.family, fam.Normal):
        self.neg_loglik = self.normal_neg_loglik
    else:
        self.neg_loglik = self.general_neg_loglik
Example #22
Source File: gasreg.py From pyflux with BSD 3-Clause "New" or "Revised" License

def __init__(self, formula, data, family):

    # Initialize TSM object
    super(GASReg, self).__init__('GASReg')

    # Latent Variables
    self.max_lag = 0
    self._z_hide = 0  # Whether to cutoff variance latent variables from results
    self.supported_methods = ["MLE", "PML", "Laplace", "M-H", "BBVI"]
    self.default_method = "MLE"
    self.multivariate_model = False
    self.skewness = False

    # Format the data
    self.is_pandas = True  # This is compulsory for this model type
    self.data_original = data
    self.formula = formula
    self.y, self.X = dmatrices(formula, data)
    self.y_name = self.y.design_info.describe()
    self.X_names = self.X.design_info.describe().split(" + ")
    self.y = self.y.astype(np.float)
    self.X = self.X.astype(np.float)
    self.z_no = self.X.shape[1]
    self.data_name = self.y_name
    self.y = np.array([self.y]).ravel()
    self.data = self.y
    self.X = np.array([self.X])[0]
    self.index = data.index
    self.initial_values = np.zeros(self.z_no)
    self.data_length = self.data.shape[0]

    self._create_model_matrices()
    self._create_latent_variables()

    self.family = family
    self.model_name2, self.link, self.scale, self.shape, self.skewness, self.mean_transform, self.cythonized = self.family.setup()

    # Identify whether model has cythonized backend - then choose update type
    if self.cythonized is True:
        self._model = self._cythonized_model
        self._mb_model = self._cythonized_mb_model
        self.recursion = self.family.gradientreg_recursion()
    else:
        self._model = self._uncythonized_model
        self._mb_model = self._uncythonized_mb_model

    self.model_name = self.model_name2 + " GAS Regression"

    # Build any remaining latent variables that are specific to the family chosen
    for no, i in enumerate(self.family.build_latent_variables()):
        self.latent_variables.add_z(i[0], i[1], i[2])
        self.latent_variables.z_list[no+self.z_no].start = i[3]

    self.family_z_no = len(self.family.build_latent_variables())
    self.z_no += len(self.family.build_latent_variables())
Example #23
Source File: mice.py From Splunking-Crime with GNU Affero General Public License v3.0

def get_fitting_data(self, vname):
    """
    Return the data needed to fit a model for imputation.

    The data is used to impute variable `vname`, and therefore
    only includes cases for which `vname` is observed.

    Values of type `PatsyFormula` in `init_kwds` or `fit_kwds` are
    processed through Patsy and subset to align with the model's
    endog and exog.

    Parameters
    ----------
    vname : string
        The variable for which the fitting data is returned.

    Returns
    -------
    endog : DataFrame
        Observed values of `vname`.
    exog : DataFrame
        Regression design matrix for imputing `vname`.
    init_kwds : dict-like
        The init keyword arguments for `vname`, processed through Patsy
        as required.
    fit_kwds : dict-like
        The fit keyword arguments for `vname`, processed through Patsy
        as required.
    """

    # Rows with observed endog
    ix = self.ix_obs[vname]

    formula = self.conditional_formula[vname]
    endog, exog = patsy.dmatrices(formula, self.data,
                                  return_type="dataframe")

    endog = np.asarray(endog.iloc[ix, 0])
    exog = np.asarray(exog.iloc[ix, :])

    init_kwds = self._process_kwds(self.init_kwds[vname], ix)
    fit_kwds = self._process_kwds(self.fit_kwds[vname], ix)

    return endog, exog, init_kwds, fit_kwds
Example #24
Source File: mice.py From Splunking-Crime with GNU Affero General Public License v3.0

def get_split_data(self, vname):
    """
    Return endog and exog for imputation of a given variable.

    Parameters
    ----------
    vname : string
        The variable for which the split data is returned.

    Returns
    -------
    endog_obs : DataFrame
        Observed values of the variable to be imputed.
    exog_obs : DataFrame
        Current values of the predictors where the variable to
        be imputed is observed.
    exog_miss : DataFrame
        Current values of the predictors where the variable to
        be imputed is missing.
    init_kwds : dict-like
        The init keyword arguments for `vname`, processed through Patsy
        as required.
    fit_kwds : dict-like
        The fit keyword arguments for `vname`, processed through Patsy
        as required.
    """

    formula = self.conditional_formula[vname]
    endog, exog = patsy.dmatrices(formula, self.data,
                                  return_type="dataframe")

    # Rows with observed endog
    ixo = self.ix_obs[vname]
    endog_obs = np.asarray(endog.iloc[ixo])
    exog_obs = np.asarray(exog.iloc[ixo, :])

    # Rows with missing endog
    ixm = self.ix_miss[vname]
    exog_miss = np.asarray(exog.iloc[ixm, :])

    predict_obs_kwds = {}
    if vname in self.predict_kwds:
        kwds = self.predict_kwds[vname]
        predict_obs_kwds = self._process_kwds(kwds, ixo)

    predict_miss_kwds = {}
    if vname in self.predict_kwds:
        kwds = self.predict_kwds[vname]
        predict_miss_kwds = self._process_kwds(kwds, ixo)

    return endog_obs, exog_obs, exog_miss, predict_obs_kwds, predict_miss_kwds
Example #25
Source File: smoothers.py From plotnine with GNU General Public License v2.0

def lm_formula(data, xseq, **params):
    """
    Fit OLS / WLS using a formula
    """
    formula = params['formula']
    # note: 'enviroment' is the key's actual spelling in plotnine's params
    eval_env = params['enviroment']
    weights = data.get('weight', None)

    if weights is None:
        init_kwargs, fit_kwargs = separate_method_kwargs(
            params['method_args'], sm.OLS, sm.OLS.fit)
        model = smf.ols(
            formula,
            data,
            eval_env=eval_env,
            **init_kwargs
        )
    else:
        if np.any(weights < 0):
            raise ValueError(
                "All weights must be greater than zero."
            )
        init_kwargs, fit_kwargs = separate_method_kwargs(
            params['method_args'], sm.OLS, sm.OLS.fit)
        model = smf.wls(
            formula,
            data,
            weights=weights,
            eval_env=eval_env,
            **init_kwargs
        )

    results = model.fit(**fit_kwargs)
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(data)

    if params['se']:
        _, predictors = dmatrices(formula, data, eval_env=eval_env)
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(
            results, predictors, alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u

    return data
Example #26
Source File: mice.py From vnpy_crypto with MIT License

def get_fitting_data(self, vname):
    """
    Return the data needed to fit a model for imputation.

    The data is used to impute variable `vname`, and therefore
    only includes cases for which `vname` is observed.

    Values of type `PatsyFormula` in `init_kwds` or `fit_kwds` are
    processed through Patsy and subset to align with the model's
    endog and exog.

    Parameters
    ----------
    vname : string
        The variable for which the fitting data is returned.

    Returns
    -------
    endog : DataFrame
        Observed values of `vname`.
    exog : DataFrame
        Regression design matrix for imputing `vname`.
    init_kwds : dict-like
        The init keyword arguments for `vname`, processed through Patsy
        as required.
    fit_kwds : dict-like
        The fit keyword arguments for `vname`, processed through Patsy
        as required.
    """

    # Rows with observed endog
    ix = self.ix_obs[vname]

    formula = self.conditional_formula[vname]
    endog, exog = patsy.dmatrices(formula, self.data,
                                  return_type="dataframe")

    endog = np.asarray(endog.iloc[ix, 0])
    exog = np.asarray(exog.iloc[ix, :])

    init_kwds = self._process_kwds(self.init_kwds[vname], ix)
    fit_kwds = self._process_kwds(self.fit_kwds[vname], ix)

    return endog, exog, init_kwds, fit_kwds
Example #27
Source File: mice.py From vnpy_crypto with MIT License

def get_split_data(self, vname):
    """
    Return endog and exog for imputation of a given variable.

    Parameters
    ----------
    vname : string
        The variable for which the split data is returned.

    Returns
    -------
    endog_obs : DataFrame
        Observed values of the variable to be imputed.
    exog_obs : DataFrame
        Current values of the predictors where the variable to
        be imputed is observed.
    exog_miss : DataFrame
        Current values of the predictors where the variable to
        be imputed is missing.
    init_kwds : dict-like
        The init keyword arguments for `vname`, processed through Patsy
        as required.
    fit_kwds : dict-like
        The fit keyword arguments for `vname`, processed through Patsy
        as required.
    """

    formula = self.conditional_formula[vname]
    endog, exog = patsy.dmatrices(formula, self.data,
                                  return_type="dataframe")

    # Rows with observed endog
    ixo = self.ix_obs[vname]
    endog_obs = np.asarray(endog.iloc[ixo])
    exog_obs = np.asarray(exog.iloc[ixo, :])

    # Rows with missing endog
    ixm = self.ix_miss[vname]
    exog_miss = np.asarray(exog.iloc[ixm, :])

    predict_obs_kwds = {}
    if vname in self.predict_kwds:
        kwds = self.predict_kwds[vname]
        predict_obs_kwds = self._process_kwds(kwds, ixo)

    predict_miss_kwds = {}
    if vname in self.predict_kwds:
        kwds = self.predict_kwds[vname]
        predict_miss_kwds = self._process_kwds(kwds, ixo)

    return (endog_obs, exog_obs, exog_miss, predict_obs_kwds,
            predict_miss_kwds)
Example #28
Source File: formulatools.py From vnpy_crypto with MIT License

def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by import formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        if data_util._is_using_pandas(Y, X):
            result = dmatrices(formula, (Y, X), depth,
                               return_type='dataframe', NA_action=na_action)
        else:
            result = dmatrices(formula, (Y, X), depth,
                               return_type='dataframe', NA_action=na_action)
    else:
        if data_util._is_using_pandas(Y, None):
            result = dmatrices(formula, Y, depth, return_type='dataframe',
                               NA_action=na_action)
        else:
            result = dmatrices(formula, Y, depth, return_type='dataframe',
                               NA_action=na_action)

    # if missing == 'raise' there's not missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info