Python patsy.dmatrix() Examples
The following are 30
code examples of patsy.dmatrix().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
patsy
, or try the search function
.
Example #1
Source File: munge.py From pybids with MIT License | 6 votes |
def _transform(self, var, by): if not isinstance(var, SimpleVariable): self._densify_variables() # Set up all the splitting variables as a DF. Note that variables in # 'by' can be either regular variables, or entities in the index--so # we need to check both places. all_variables = self._variables by_variables = [all_variables[v].values if v in all_variables else var.index[v].reset_index(drop=True) for v in listify(by)] group_data = pd.concat(by_variables, axis=1, sort=True) group_data.columns = listify(by) # Use patsy to create splitting design matrix group_data = group_data.astype(str) formula = '0+' + ':'.join(listify(by)) dm = dmatrix(formula, data=group_data, return_type='dataframe') dm.columns = [col.replace(':', '.') for col in dm.columns] return var.split(dm)
Example #2
Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0 | 6 votes |
def transform(self, data): """Transform with estimator using formula. Transform the data using formula, then transform it using the estimator. Parameters ---------- data : dict-like (pandas dataframe) Input data. Column names need to match variables in formula. """ if self.return_type == 'dataframe': X = dmatrix(self.design_X_, data, return_type='dataframe') else: X = np.array(dmatrix(self.design_X_, data)) return self.estimator_.transform(X)
Example #3
Source File: P1_utils.py From Benchmarks with MIT License | 6 votes |
def design_mat(mod, numerical_covariates, batch_levels): # require levels to make sure they are in the same order as we use in the # rest of the script. design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels), mod, return_type="dataframe") mod = mod.drop(["batch"], axis=1) numerical_covariates = list(numerical_covariates) sys.stderr.write("found %i batches\n" % design.shape[1]) other_cols = [c for i, c in enumerate(mod.columns) if not i in numerical_covariates] factor_matrix = mod[other_cols] design = pd.concat((design, factor_matrix), axis=1) if numerical_covariates is not None: sys.stderr.write("found %i numerical covariates...\n" % len(numerical_covariates)) for i, nC in enumerate(numerical_covariates): cname = mod.columns[nC] sys.stderr.write("\t{0}\n".format(cname)) design[cname] = mod[mod.columns[nC]] sys.stderr.write("found %i categorical variables:" % len(other_cols)) sys.stderr.write("\t" + ", ".join(other_cols) + '\n') return design
Example #4
Source File: TimeVary.py From zEpid with MIT License | 6 votes |
def __predict__(df, model, variable, formula): """UNUSED FUNCTION New hidden predict method to shorten Monte Carlo estimation code. This is slower than statsmodels predict, but likely can be optimized. I would need to find a faster alternative to np.dot()... For this to work, I need to do the following: -need to have each ..._model() store a self.formula -switch all functions to sm.GLM or sm.GEE """ import patsy from zepid.calc import odds_to_probability xdata = patsy.dmatrix(formula, df) # , return_type='dataframe') # pred = xdata.mul(np.array(model.params), axis='columns').sum(axis=1) pred = xdata.dot(model.params) # TODO optimize this... if variable == 'binary': pred = np.random.binomial(1, odds_to_probability(np.exp(pred)), size=xdata.shape[0]) elif variable == 'continuous': pred = np.random.normal(loc=pred, scale=np.std(model.resid), size=len(pp)) # TODO add optimization for multinomial (if applicable) else: raise ValueError('That option is not supported') return pred
Example #5
Source File: patsy_wraps.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def transform_with_patsy(formula, data, *args, **kwargs): try: # needs patsy v0.5.1 to support formula in Python 3.7 # https://github.com/pydata/patsy/pull/131 import patsy except ImportError: raise ImportError("'patsy' is required to transform with string formula") if '~' in formula: y, X = patsy.dmatrices(formula, data=data, return_type='dataframe', *args, **kwargs) if len(y.shape) > 1 and y.shape[1] != 1: raise ValueError('target must be 1 dimensional') y = y.iloc[:, 0] return data._constructor(X, target=y) else: X = patsy.dmatrix(formula, data=data, return_type='dataframe', *args, **kwargs) return data._constructor(X)
Example #6
Source File: combat.py From altanalyze with Apache License 2.0 | 6 votes |
def design_mat(mod, numerical_covariates, batch_levels): # require levels to make sure they are in the same order as we use in the # rest of the script. design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels), mod, return_type="dataframe") mod = mod.drop(["batch"], axis=1) numerical_covariates = list(numerical_covariates) sys.stderr.write("found %i batches\n" % design.shape[1]) other_cols = [c for i, c in enumerate(mod.columns) if not i in numerical_covariates] factor_matrix = mod[other_cols] design = pd.concat((design, factor_matrix), axis=1) if numerical_covariates is not None: sys.stderr.write("found %i numerical covariates...\n" % len(numerical_covariates)) for i, nC in enumerate(numerical_covariates): cname = mod.columns[nC] sys.stderr.write("\t{0}\n".format(cname)) design[cname] = mod[mod.columns[nC]] sys.stderr.write("found %i categorical variables:" % len(other_cols)) sys.stderr.write("\t" + ", ".join(other_cols) + '\n') return design
Example #7
Source File: segmentation.py From eemeter with Apache License 2.0 | 5 votes |
def predict(self, data): """ A function which takes input data and predicts for this segment model. """ if self.formula is None: var_str = "" else: var_str = self.formula.split("~", 1)[1] design_matrix_granular = dmatrix(var_str, data, return_type="dataframe") parameters = pd.Series(self.model_params) # Step 1, slice col_type = "C(hour_of_week)" hour_of_week_cols = [ c for c in design_matrix_granular.columns if col_type in c and c in parameters.keys() ] # Step 2, cut out all 0s design_matrix_granular = design_matrix_granular[ (design_matrix_granular[hour_of_week_cols] != 0).any(axis=1) ] cols_to_predict = list( set(parameters.keys()).intersection(set(design_matrix_granular.keys())) ) design_matrix_granular = design_matrix_granular[cols_to_predict] parameters = parameters[cols_to_predict] # Step 3, predict prediction = design_matrix_granular.dot(parameters).rename( columns={0: "predicted_usage"} ) # Step 4, put nans back in prediction = prediction.reindex(data.index) return prediction
Example #8
Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0 | 5 votes |
def _fit_transform(self, data, y=None): eval_env = EvalEnvironment.capture(self.eval_env, reference=2) formula = _drop_intercept(self.formula, self.add_intercept) design = dmatrix(formula, data, eval_env=eval_env, NA_action=self.NA_action, return_type='dataframe') self.design_ = design.design_info if self.return_type == 'dataframe': return design else: return np.array(design) self.feature_names_ = design.design_info.column_names return np.array(design)
Example #9
Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0 | 5 votes |
def decision_function(self, data): """Compute decision_function of estimator using formula. Transform the data using formula, then predict on it using the estimator. Parameters ---------- data : dict-like (pandas dataframe) Input data. Column names need to match variables in formula. """ X = np.array(dmatrix(self.design_X_, data)) return self.estimator_.decision_function(X)
Example #10
Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0 | 5 votes |
def predict_proba(self, data): """Compute predict_proba with estimator using formula. Transform the data using formula, then predict probabilities on it using the estimator. Parameters ---------- data : dict-like (pandas dataframe) Input data. Column names need to match variables in formula. """ X = np.array(dmatrix(self.design_X_, data)) return self.estimator_.predict_proba(X)
Example #11
Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0 | 5 votes |
def predict(self, data): """Predict with estimator using formula. Transform the data using formula, then predict on it using the estimator. Parameters ---------- data : dict-like (pandas dataframe) Input data. Column names need to match variables in formula. """ X = np.array(dmatrix(self.design_X_, data)) return self.estimator_.predict(X)
Example #12
Source File: test_mnl_urbansim.py From choicemodels with BSD 3-Clause "New" or "Revised" License | 5 votes |
def dm(df, test_data): return dmatrix(test_data['formula'], data=df, return_type='dataframe')
Example #13
Source File: test_mnl_new.py From choicemodels with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_mnl_prediction(obs, alts): """ Confirm that fitted probabilities in the new codebase match urbansim.urbanchoice. Only runs if the urbansim package has been installed. """ try: from urbansim.urbanchoice.mnl import mnl_simulate except: print("Comparison of MNL simulation results skipped because urbansim is not installed") return # produce a fitted model mct = MergedChoiceTable(obs, alts, 'choice', 5) m = MultinomialLogit(mct, model_expression='obsval + altval - 1') results = m.fit() # get predicted probabilities using choicemodels probs1 = results.probabilities(mct) # compare to probabilities from urbansim.urbanchoice dm = dmatrix(results.model_expression, data=mct.to_frame(), return_type='dataframe') probs = mnl_simulate(data=dm, coeff=results.fitted_parameters, numalts=mct.sample_size, returnprobs=True) df = mct.to_frame() df['prob'] = probs.flatten() probs2 = df.prob pd.testing.assert_series_equal(probs1, probs2)
Example #14
Source File: test_mnl_new.py From choicemodels with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_mnl_estimation(obs, alts): """ Confirm that estimated params from the new interface match urbansim.urbanchoice. Only runs if the urbansim package has been installed. """ try: from urbansim.urbanchoice.mnl import mnl_estimate except: print("Comparison of MNL estimation results skipped because urbansim is not installed") return model_expression = 'obsval + altval - 1' mct = MergedChoiceTable(obs, alts, 'choice') # new interface m = MultinomialLogit(mct, model_expression) r = m.fit().get_raw_results() # old interface dm = dmatrix(model_expression, mct.to_frame()) chosen = np.reshape(mct.to_frame()[mct.choice_col].values, (100, 5)) log_lik, fit = mnl_estimate(np.array(dm), chosen, numalts=5) for k,v in log_lik.items(): assert(v == pytest.approx(r['log_likelihood'][k], 0.00001)) assert_frame_equal(fit, r['fit_parameters'][['Coefficient', 'Std. Error', 'T-Score']])
Example #15
Source File: combat.py From altanalyze with Apache License 2.0 | 5 votes |
def runPyCombat(fl): """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """ print 'Running Combat...', expr_input_dir = fl.ExpFile() pheno_dir = formatPhenoFile(fl) moved_exp_dir = export.findParentDir(expr_input_dir)+'Non-Combat/'+export.findFilename(expr_input_dir) try: export.copyFile(expr_input_dir, moved_exp_dir) print 'Moved original expression file to:' print '\t'+moved_exp_dir ### now overwrite the origin excluding the commented rows export.cleanFile(expr_input_dir,removeExtra='#') ### remove comments from the original file except Exception: None pheno = pd.read_table(pheno_dir, index_col=0) dat = pd.read_table(expr_input_dir, index_col=0) mod = patsy.dmatrix("group", pheno, return_type="dataframe") t = time.time() #print dat, pheno.batch, mod;sys.exit() ebat = combat(dat, pheno.batch, mod) print "...Combat completed in %.2f seconds" % (time.time() - t) print 'Original expression file over-written with batch effect removal results...' ebat.to_csv(expr_input_dir, sep="\t")
Example #16
Source File: test_regression_transforms.py From yatsm with MIT License | 5 votes |
def test_harmonic_transform(): x = np.arange(735688, 735688 + 100, 1) design = patsy.dmatrix('0 + harm(x, 1)') truth = np.vstack((np.cos(2 * np.pi / 365.25 * x), np.sin(2 * np.pi / 365.25 * x))).T np.testing.assert_equal(np.asarray(design), truth)
Example #17
Source File: glm.py From py-glm with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _make_rhs_matrix(self, X): formula_parts = self.formula.split('~') if len(formula_parts) == 2: rhs_formula = formula_parts[1].strip() elif len(formula_parts) == 1: rhs_formula = formula_parts.strip() else: raise ValueError( f"Cannot parse model formula {self.formula} to determine right hand side!") X = pt.dmatrix(rhs_formula, X) return X
Example #18
Source File: markers.py From dynamo-release with BSD 3-Clause "New" or "Revised" License | 5 votes |
def diff_test_helper(data, fullModelFormulaStr="~cr(time, df=3)", reducedModelFormulaStr="~1", ): # Dividing data into train and validation datasets transformed_x = dmatrix(fullModelFormulaStr, data, return_type='dataframe') transformed_x_null = dmatrix(reducedModelFormulaStr, data, return_type='dataframe') expression = data['expression'] poisson_training_results = sm.GLM(expression, transformed_x, family=sm.families.Poisson()).fit() poisson_df = pd.DataFrame({'mu': poisson_training_results.mu, 'expression': expression}) poisson_df['AUX_OLS_DEP'] = poisson_df.apply(lambda x: ((x['expression'] - x['mu']) ** 2 - x['expression']) / x['mu'], axis=1) ols_expr = """AUX_OLS_DEP ~ mu - 1""" aux_olsr_results = smf.ols(ols_expr, poisson_df).fit() nb2_family = sm.families.NegativeBinomial(alpha=aux_olsr_results.params[0]) try: nb2_full = sm.GLM(expression, transformed_x, family=nb2_family).fit() nb2_null = sm.GLM(expression, transformed_x_null, family=nb2_family).fit() except: return ('fail', 'NB2', 1) pval = lrt(nb2_full, nb2_null) return ('ok', 'NB2', pval)
Example #19
Source File: patsytransformer.py From scikit-lego with MIT License | 5 votes |
def fit(self, X, y=None): """Fits the estimator""" X_ = dmatrix(self.formula, X, NA_action="raise", return_type=self.return_type) # check the number of observations hasn't changed. This ought not to # be necessary given NA_action='raise' above but just to be safe assert np.array(X_).shape[0] == np.array(X).shape[0] self.design_info_ = X_.design_info return self
Example #20
Source File: mice.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def _process_kwds(self, kwds, ix): kwds = kwds.copy() for k in kwds: v = kwds[k] if isinstance(v, PatsyFormula): mat = patsy.dmatrix(v.formula, self.data, return_type="dataframe") mat = np.asarray(mat)[ix, :] if mat.shape[1] == 1: mat = mat[:, 0] kwds[k] = mat return kwds
Example #21
Source File: test_data.py From vnpy_crypto with MIT License | 5 votes |
def test_patsy_577(): X = np.random.random((10, 2)) df = pandas.DataFrame(X, columns=["var1", "var2"]) from patsy import dmatrix endog = dmatrix("var1 - 1", df) np.testing.assert_(data._is_using_patsy(endog, None)) exog = dmatrix("var2 - 1", df) np.testing.assert_(data._is_using_patsy(endog, exog))
Example #22
Source File: test_mediation.py From vnpy_crypto with MIT License | 5 votes |
def test_framing_example_moderator(): # moderation without formulas, generally not useful but test anyway cur_dir = os.path.dirname(os.path.abspath(__file__)) data = pd.read_csv(os.path.join(cur_dir, 'results', "framing.csv")) outcome = np.asarray(data["cong_mesg"]) outcome_exog = patsy.dmatrix("emo + treat + age + educ + gender + income", data, return_type='dataframe') probit = sm.families.links.probit outcome_model = sm.GLM(outcome, outcome_exog, family=sm.families.Binomial(link=probit())) mediator = np.asarray(data["emo"]) mediator_exog = patsy.dmatrix("treat + age + educ + gender + income", data, return_type='dataframe') mediator_model = sm.OLS(mediator, mediator_exog) tx_pos = [outcome_exog.columns.tolist().index("treat"), mediator_exog.columns.tolist().index("treat")] med_pos = outcome_exog.columns.tolist().index("emo") ix = (outcome_exog.columns.tolist().index("age"), mediator_exog.columns.tolist().index("age")) moderators = {ix : 20} med = Mediation(outcome_model, mediator_model, tx_pos, med_pos, moderators=moderators) # Just a smoke test np.random.seed(4231) med_rslt = med.fit(method='parametric', n_rep=100)
Example #23
Source File: test_mediation.py From vnpy_crypto with MIT License | 5 votes |
def test_framing_example(): cur_dir = os.path.dirname(os.path.abspath(__file__)) data = pd.read_csv(os.path.join(cur_dir, 'results', "framing.csv")) outcome = np.asarray(data["cong_mesg"]) outcome_exog = patsy.dmatrix("emo + treat + age + educ + gender + income", data, return_type='dataframe') probit = sm.families.links.probit outcome_model = sm.GLM(outcome, outcome_exog, family=sm.families.Binomial(link=probit())) mediator = np.asarray(data["emo"]) mediator_exog = patsy.dmatrix("treat + age + educ + gender + income", data, return_type='dataframe') mediator_model = sm.OLS(mediator, mediator_exog) tx_pos = [outcome_exog.columns.tolist().index("treat"), mediator_exog.columns.tolist().index("treat")] med_pos = outcome_exog.columns.tolist().index("emo") med = Mediation(outcome_model, mediator_model, tx_pos, med_pos, outcome_fit_kwargs={'atol':1e-11}) np.random.seed(4231) para_rslt = med.fit(method='parametric', n_rep=100) diff = np.asarray(para_rslt.summary() - framing_para_4231) assert_allclose(diff, 0, atol=1e-6) np.random.seed(4231) boot_rslt = med.fit(method='boot', n_rep=100) diff = np.asarray(boot_rslt.summary() - framing_boot_4231) assert_allclose(diff, 0, atol=1e-6)
Example #24
Source File: test_phreg.py From vnpy_crypto with MIT License | 5 votes |
def test_predict_formula(self): n = 100 np.random.seed(34234) time = 50 * np.random.uniform(size=n) status = np.random.randint(0, 2, n).astype(np.float64) exog = np.random.uniform(1, 2, size=(n, 2)) df = pd.DataFrame({"time": time, "status": status, "exog1": exog[:, 0], "exog2": exog[:, 1]}) # Works with "0 +" on RHS but issues warning fml = "time ~ exog1 + np.log(exog2) + exog1*exog2" model1 = PHReg.from_formula(fml, df, status=status) result1 = model1.fit() from patsy import dmatrix dfp = dmatrix(model1.data.design_info.builder, df) pr1 = result1.predict() pr2 = result1.predict(exog=df) pr3 = model1.predict(result1.params, exog=dfp) # No standard errors pr4 = model1.predict(result1.params, cov_params=result1.cov_params(), exog=dfp) prl = (pr1, pr2, pr3, pr4) for i in range(4): for j in range(i): assert_allclose(prl[i].predicted_values, prl[j].predicted_values) prl = (pr1, pr2, pr4) for i in range(3): for j in range(i): assert_allclose(prl[i].standard_errors, prl[j].standard_errors)
Example #25
Source File: yatsm.py From yatsm with MIT License | 5 votes |
def setup(self, df, **config): """ Setup model for input dataset and (optionally) return design matrix Args: df (pandas.DataFrame): Pandas dataframe containing dataset attributes (e.g., dates, image ID, path/row, metadata, etc.) config (dict): YATSM configuration dictionary from user, including 'dataset' and 'YATSM' sub-configurations Returns: numpy.ndarray or None: return design matrix if used by algorithm """ X = patsy.dmatrix(config['YATSM']['design_matrix'], data=df) return X
Example #26
Source File: mnl.py From choicemodels with BSD 3-Clause "New" or "Revised" License | 4 votes |
def fit(self): """ Fit the model using maximum likelihood estimation. Uses either the ChoiceModels or PyLogit estimation engine as appropriate. Returns ------- MultinomialLogitResults() object. """ if (self._estimation_engine == 'PyLogit'): m = pylogit.create_choice_model(data = self._df, obs_id_col = self._observation_id_col, alt_id_col = self._alternative_id_col, choice_col = self._choice_col, specification = self._model_expression, names = self._model_labels, model_type = 'MNL') m.fit_mle(init_vals = self._initial_coefs) results = MultinomialLogitResults(estimation_engine = self._estimation_engine, model_expression = self._model_expression, results = m) elif (self._estimation_engine == 'ChoiceModels'): dm = dmatrix(self._model_expression, data=self._df) chosen = np.reshape(self._df[[self._choice_col]].values, (self._numobs, self._numalts)) log_lik, fit = mnl_estimate(np.array(dm), chosen, self._numalts) result_params = dict(log_likelihood = log_lik, fit_parameters = fit, x_names = dm.design_info.column_names) results = MultinomialLogitResults(estimation_engine = self._estimation_engine, model_expression = self._model_expression, results = result_params) return results
Example #27
Source File: mnl.py From choicemodels with BSD 3-Clause "New" or "Revised" License | 4 votes |
def probabilities(self, data): """ Generate predicted probabilities for a table of choice scenarios, using the fitted parameters stored in the results object. Parameters ---------- data : choicemodels.tools.MergedChoiceTable Long-format table of choice scenarios. TO DO - accept other data formats. Expected class parameters ------------------------- self.model_expression : patsy string self.fitted_parameters : list of floats Returns ------- pandas.Series with indexes matching the input """ # TO DO - make sure this handles pylogit case # TO DO - does MergedChoiceTable guarantee that alternatives for a single scenario # are consecutive? seems like a requirement here; should document it df = data.to_frame() numalts = data.sample_size # TO DO - make this an official MCT param dm = dmatrix(self.model_expression, data=df) # utility is sum of data values times fitted betas u = np.dot(self.fitted_parameters, np.transpose(dm)) # reshape so axis 0 lists alternatives and axis 1 lists choosers u = np.reshape(u, (numalts, u.size // numalts), order='F') # scale the utilities to make exponentiation easier # https://stats.stackexchange.com/questions/304758/softmax-overflow u = u - u.max(axis=0) exponentiated_utility = np.exp(u) sum_exponentiated_utility = np.sum(exponentiated_utility, axis=0) probs = exponentiated_utility / sum_exponentiated_utility # convert back to ordering of the input data probs = probs.flatten(order='F') df['prob'] = probs # adds indexes return df.prob
Example #28
Source File: TMLE.py From zEpid with MIT License | 4 votes |
def exposure_model(self, model, custom_model=None, bound=False, print_results=True): """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature Parameters ---------- model : str Independent variables to predict the exposure. Example) 'var1 + var2 + var3' custom_model : optional Input for a custom model that is used in place of the logit model (default). The model must have the "fit()" and "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the background, TMLE will fit the custom model and generate the predicted probablities bound : float, list, optional Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations. Specifying this argument can improve finite sample performance for random positivity violations. However, truncating weights leads to additional confounding. Default is False, meaning no truncation of predicted probabilities occurs. Providing a single float assumes symmetric trunctation, where values below or above the threshold are set to the threshold value. Alternatively a list of floats can be provided for asymmetric trunctation, with the first value being the lower bound and the second being the upper bound print_results : bool, optional Whether to print the fitted model results. Default is True (prints results) """ self._exp_model = self.exposure + ' ~ ' + model self.__mweight = model # Step 3) Estimation of g-model (exposure model) if custom_model is None: fitmodel = propensity_score(self.df, self._exp_model, print_results=print_results) self.g1W = fitmodel.predict(self.df) # User-specified prediction model else: # TODO need to create smart warning system # warnings.warn("TMLE can result in confidence intervals below nominal coverage when used with " # "certain machine learning algorithms") self._exp_model_custom = True data = patsy.dmatrix(model + ' - 1', self.df) self.g1W = exposure_machine_learner(xdata=np.asarray(data), ydata=np.asarray(self.df[self.exposure]), ml_model=custom_model, print_results=print_results) self.g0W = 1 - self.g1W if bound: # Bounding predicted probabilities if requested self.g1W = _bounding_(self.g1W, bounds=bound) self.g0W = _bounding_(self.g0W, bounds=bound) self._fit_exposure_model = True
Example #29
Source File: _combat.py From scanpy with BSD 3-Clause "New" or "Revised" License | 4 votes |
def _design_matrix( model: pd.DataFrame, batch_key: str, batch_levels: Collection[str], ) -> pd.DataFrame: """\ Computes a simple design matrix. Parameters -------- model Contains the batch annotation batch_key Name of the batch column batch_levels Levels of the batch annotation Returns -------- The design matrix for the regression problem """ import patsy design = patsy.dmatrix( "~ 0 + C(Q('{}'), levels=batch_levels)".format(batch_key), model, return_type="dataframe", ) model = model.drop([batch_key], axis=1) numerical_covariates = model.select_dtypes('number').columns.values logg.info(f"Found {design.shape[1]} batches\n") other_cols = [c for c in model.columns.values if c not in numerical_covariates] if other_cols: col_repr = " + ".join("Q('{}')".format(x) for x in other_cols) factor_matrix = patsy.dmatrix( "~ 0 + {}".format(col_repr), model[other_cols], return_type="dataframe" ) design = pd.concat((design, factor_matrix), axis=1) logg.info(f"Found {len(other_cols)} categorical variables:") logg.info("\t" + ", ".join(other_cols) + '\n') if numerical_covariates is not None: logg.info(f"Found {len(numerical_covariates)} numerical variables:") logg.info("\t" + ", ".join(numerical_covariates) + '\n') for nC in numerical_covariates: design[nC] = model[nC] return design
Example #30
Source File: bayes_mixed_glm.py From vnpy_crypto with MIT License | 4 votes |
def from_formula(cls, formula, vc_formulas, data, family=None, vcp_p=1, fe_p=2): """ Fit a BayesMixedGLM using a formula. Parameters ---------- formula : string Formula for the endog and fixed effects terms (use ~ to separate dependent and independent expressions). vc_formulas : dictionary vc_formulas[name] is a one-sided formula that creates one collection of random effects with a common variance prameter. If using a categorical expression to produce variance components, note that generally `0 + ...` should be used so that an intercept is not included. data : data frame The data to which the formulas are applied. family : genmod.families instance A GLM family. vcp_p : float The prior standard deviation for the logarithms of the standard deviations of the random effects. fe_p : float The prior standard deviation for the fixed effects parameters. """ ident = [] exog_vc = [] vcp_names = [] j = 0 for na, fml in vc_formulas.items(): mat = patsy.dmatrix(fml, data, return_type='dataframe') exog_vc.append(mat) vcp_names.append(na) ident.append(j * np.ones(mat.shape[1])) j += 1 exog_vc = pd.concat(exog_vc, axis=1) vc_names = exog_vc.columns.tolist() ident = np.concatenate(ident) model = super(_BayesMixedGLM, cls).from_formula( formula, data=data, family=family, subset=None, exog_vc=exog_vc, ident=ident, vc_names=vc_names, vcp_names=vcp_names, fe_p=fe_p, vcp_p=vcp_p) return model