Python Examples of patsy.dmatrix

Source File: munge.py From pybids with MIT License

6 votes

def _transform(self, var, by):

        if not isinstance(var, SimpleVariable):
            self._densify_variables()

        # Set up all the splitting variables as a DF. Note that variables in
        # 'by' can be either regular variables, or entities in the index--so
        # we need to check both places.
        all_variables = self._variables
        by_variables = [all_variables[v].values if v in all_variables
                        else var.index[v].reset_index(drop=True)
                        for v in listify(by)]
        group_data = pd.concat(by_variables, axis=1, sort=True)
        group_data.columns = listify(by)

        # Use patsy to create splitting design matrix
        group_data = group_data.astype(str)
        formula = '0+' + ':'.join(listify(by))
        dm = dmatrix(formula, data=group_data, return_type='dataframe')
        dm.columns = [col.replace(':', '.') for col in dm.columns]

        return var.split(dm)

Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0

6 votes

def transform(self, data):
        """Transform with estimator using formula.

        Transform the data using formula, then transform it
        using the estimator.

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Column names need to match variables in formula.
        """

        if self.return_type == 'dataframe':
            X = dmatrix(self.design_X_, data, return_type='dataframe')
        else:
            X = np.array(dmatrix(self.design_X_, data))

        return self.estimator_.transform(X)

Source File: P1_utils.py From Benchmarks with MIT License

6 votes

def design_mat(mod, numerical_covariates, batch_levels):
    # require levels to make sure they are in the same order as we use in the
    # rest of the script.
    design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels),
                                                  mod, return_type="dataframe")

    mod = mod.drop(["batch"], axis=1)
    numerical_covariates = list(numerical_covariates)
    sys.stderr.write("found %i batches\n" % design.shape[1])
    other_cols = [c for i, c in enumerate(mod.columns)
                  if not i in numerical_covariates]
    factor_matrix = mod[other_cols]
    design = pd.concat((design, factor_matrix), axis=1)
    if numerical_covariates is not None:
        sys.stderr.write("found %i numerical covariates...\n"
                            % len(numerical_covariates))
        for i, nC in enumerate(numerical_covariates):
            cname = mod.columns[nC]
            sys.stderr.write("\t{0}\n".format(cname))
            design[cname] = mod[mod.columns[nC]]
    sys.stderr.write("found %i categorical variables:" % len(other_cols))
    sys.stderr.write("\t" + ", ".join(other_cols) + '\n')
    return design

Source File: TimeVary.py From zEpid with MIT License

6 votes

def __predict__(df, model, variable, formula):
        """UNUSED FUNCTION

        New hidden predict method to shorten Monte Carlo estimation code. This is slower than statsmodels predict,
        but likely can be optimized. I would need to find a faster alternative to np.dot()...

        For this to work, I need to do the following:
            -need to have each ..._model() store a self.formula
            -switch all functions to sm.GLM or sm.GEE
        """
        import patsy
        from zepid.calc import odds_to_probability

        xdata = patsy.dmatrix(formula, df)  # , return_type='dataframe')
        # pred = xdata.mul(np.array(model.params), axis='columns').sum(axis=1)
        pred = xdata.dot(model.params)  # TODO optimize this...

        if variable == 'binary':
            pred = np.random.binomial(1, odds_to_probability(np.exp(pred)), size=xdata.shape[0])
        elif variable == 'continuous':
            pred = np.random.normal(loc=pred, scale=np.std(model.resid), size=len(pp))
        # TODO add optimization for multinomial (if applicable)
        else:
            raise ValueError('That option is not supported')
        return pred

Source File: patsy_wraps.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

6 votes

def transform_with_patsy(formula, data, *args, **kwargs):
    try:
        # needs patsy v0.5.1 to support formula in Python 3.7
        # https://github.com/pydata/patsy/pull/131
        import patsy
    except ImportError:
        raise ImportError("'patsy' is required to transform with string formula")

    if '~' in formula:
        y, X = patsy.dmatrices(formula, data=data, return_type='dataframe',
                               *args, **kwargs)
        if len(y.shape) > 1 and y.shape[1] != 1:
            raise ValueError('target must be 1 dimensional')
        y = y.iloc[:, 0]
        return data._constructor(X, target=y)
    else:
        X = patsy.dmatrix(formula, data=data, return_type='dataframe',
                          *args, **kwargs)
        return data._constructor(X)

Source File: combat.py From altanalyze with Apache License 2.0

6 votes

def design_mat(mod, numerical_covariates, batch_levels):
    # require levels to make sure they are in the same order as we use in the
    # rest of the script.
    design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels),
                                                  mod, return_type="dataframe")

    mod = mod.drop(["batch"], axis=1)
    numerical_covariates = list(numerical_covariates)
    sys.stderr.write("found %i batches\n" % design.shape[1])
    other_cols = [c for i, c in enumerate(mod.columns)
                  if not i in numerical_covariates]
    factor_matrix = mod[other_cols]
    design = pd.concat((design, factor_matrix), axis=1)
    if numerical_covariates is not None:
        sys.stderr.write("found %i numerical covariates...\n"
                            % len(numerical_covariates))
        for i, nC in enumerate(numerical_covariates):
            cname = mod.columns[nC]
            sys.stderr.write("\t{0}\n".format(cname))
            design[cname] = mod[mod.columns[nC]]
    sys.stderr.write("found %i categorical variables:" % len(other_cols))
    sys.stderr.write("\t" + ", ".join(other_cols) + '\n')
    return design

Source File: segmentation.py From eemeter with Apache License 2.0

5 votes

def predict(self, data):
        """ A function which takes input data and predicts for this segment model.
        """
        if self.formula is None:
            var_str = ""
        else:
            var_str = self.formula.split("~", 1)[1]

        design_matrix_granular = dmatrix(var_str, data, return_type="dataframe")
        parameters = pd.Series(self.model_params)

        # Step 1, slice
        col_type = "C(hour_of_week)"
        hour_of_week_cols = [
            c
            for c in design_matrix_granular.columns
            if col_type in c and c in parameters.keys()
        ]

        # Step 2, cut out all 0s
        design_matrix_granular = design_matrix_granular[
            (design_matrix_granular[hour_of_week_cols] != 0).any(axis=1)
        ]

        cols_to_predict = list(
            set(parameters.keys()).intersection(set(design_matrix_granular.keys()))
        )
        design_matrix_granular = design_matrix_granular[cols_to_predict]
        parameters = parameters[cols_to_predict]

        # Step 3, predict
        prediction = design_matrix_granular.dot(parameters).rename(
            columns={0: "predicted_usage"}
        )
        # Step 4, put nans back in
        prediction = prediction.reindex(data.index)

        return prediction

Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0

5 votes

def _fit_transform(self, data, y=None):
        eval_env = EvalEnvironment.capture(self.eval_env, reference=2)
        formula = _drop_intercept(self.formula, self.add_intercept)

        design = dmatrix(formula, data, eval_env=eval_env, NA_action=self.NA_action,
                         return_type='dataframe')
        self.design_ = design.design_info

        if self.return_type == 'dataframe':
            return design
        else:
            return np.array(design)

        self.feature_names_ = design.design_info.column_names
        return np.array(design)

Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0

5 votes

def decision_function(self, data):
        """Compute decision_function of estimator using formula.

        Transform the data using formula, then predict on it
        using the estimator.

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Column names need to match variables in formula.
        """
        X = np.array(dmatrix(self.design_X_, data))
        return self.estimator_.decision_function(X)

Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0

5 votes

def predict_proba(self, data):
        """Compute predict_proba with estimator using formula.

        Transform the data using formula, then predict probabilities on it
        using the estimator.

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Column names need to match variables in formula.
        """
        X = np.array(dmatrix(self.design_X_, data))
        return self.estimator_.predict_proba(X)

Source File: patsy_adaptor.py From patsylearn with GNU General Public License v2.0

5 votes

def predict(self, data):
        """Predict with estimator using formula.

        Transform the data using formula, then predict on it
        using the estimator.

        Parameters
        ----------
        data : dict-like (pandas dataframe)
            Input data. Column names need to match variables in formula.
        """
        X = np.array(dmatrix(self.design_X_, data))
        return self.estimator_.predict(X)

Source File: test_mnl_urbansim.py From choicemodels with BSD 3-Clause "New" or "Revised" License

5 votes

def dm(df, test_data):
    return dmatrix(test_data['formula'], data=df, return_type='dataframe')

Source File: test_mnl_new.py From choicemodels with BSD 3-Clause "New" or "Revised" License

5 votes

def test_mnl_prediction(obs, alts):
    """
    Confirm that fitted probabilities in the new codebase match urbansim.urbanchoice.
    Only runs if the urbansim package has been installed.
    
    """
    try:
        from urbansim.urbanchoice.mnl import mnl_simulate
    except:
        print("Comparison of MNL simulation results skipped because urbansim is not installed")
        return

    # produce a fitted model
    mct = MergedChoiceTable(obs, alts, 'choice', 5)
    m = MultinomialLogit(mct, model_expression='obsval + altval - 1')
    results = m.fit()
    
    # get predicted probabilities using choicemodels
    probs1 = results.probabilities(mct)
    
    # compare to probabilities from urbansim.urbanchoice
    dm = dmatrix(results.model_expression, data=mct.to_frame(), return_type='dataframe')

    probs = mnl_simulate(data=dm, coeff=results.fitted_parameters,
                         numalts=mct.sample_size, returnprobs=True)

    df = mct.to_frame()
    df['prob'] = probs.flatten()
    probs2 = df.prob
    
    pd.testing.assert_series_equal(probs1, probs2)

Source File: test_mnl_new.py From choicemodels with BSD 3-Clause "New" or "Revised" License

5 votes

def test_mnl_estimation(obs, alts):
    """
    Confirm that estimated params from the new interface match urbansim.urbanchoice.
    Only runs if the urbansim package has been installed.
    
    """
    try:
        from urbansim.urbanchoice.mnl import mnl_estimate
    except:
        print("Comparison of MNL estimation results skipped because urbansim is not installed")
        return

    model_expression = 'obsval + altval - 1'
    mct = MergedChoiceTable(obs, alts, 'choice')
    
    # new interface
    m = MultinomialLogit(mct, model_expression)
    r = m.fit().get_raw_results()
    
    # old interface
    dm = dmatrix(model_expression, mct.to_frame())
    chosen = np.reshape(mct.to_frame()[mct.choice_col].values, (100, 5))
    log_lik, fit = mnl_estimate(np.array(dm), chosen, numalts=5)
    
    for k,v in log_lik.items():
        assert(v == pytest.approx(r['log_likelihood'][k], 0.00001))
    
    assert_frame_equal(fit, r['fit_parameters'][['Coefficient', 'Std. Error', 'T-Score']])

Source File: combat.py From altanalyze with Apache License 2.0

5 votes

def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """
    print 'Running Combat...',
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)
    
    moved_exp_dir = export.findParentDir(expr_input_dir)+'Non-Combat/'+export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print 'Moved original expression file to:'
        print '\t'+moved_exp_dir
        ### now overwrite the origin excluding the commented rows
        export.cleanFile(expr_input_dir,removeExtra='#') ### remove comments from the original file
    except Exception: None
    
    pheno = pd.read_table(pheno_dir, index_col=0)
    dat = pd.read_table(expr_input_dir, index_col=0)

    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
    t = time.time()
    #print dat, pheno.batch, mod;sys.exit()
    ebat = combat(dat, pheno.batch, mod)
    print "...Combat completed in %.2f seconds" % (time.time() - t)
    
    print 'Original expression file over-written with batch effect removal results...'
    ebat.to_csv(expr_input_dir, sep="\t")

Source File: test_regression_transforms.py From yatsm with MIT License

5 votes

def test_harmonic_transform():
    x = np.arange(735688, 735688 + 100, 1)
    design = patsy.dmatrix('0 + harm(x, 1)')

    truth = np.vstack((np.cos(2 * np.pi / 365.25 * x),
                       np.sin(2 * np.pi / 365.25 * x))).T

    np.testing.assert_equal(np.asarray(design), truth)

Source File: glm.py From py-glm with BSD 3-Clause "New" or "Revised" License

5 votes

def _make_rhs_matrix(self, X):
        formula_parts = self.formula.split('~')
        if len(formula_parts) == 2:
            rhs_formula = formula_parts[1].strip()
        elif len(formula_parts) == 1:
            rhs_formula = formula_parts.strip()
        else:
            raise ValueError(
                f"Cannot parse model formula {self.formula} to determine right hand side!")
        X = pt.dmatrix(rhs_formula, X)
        return X

Source File: markers.py From dynamo-release with BSD 3-Clause "New" or "Revised" License

5 votes

def diff_test_helper(data,
                     fullModelFormulaStr="~cr(time, df=3)",
                     reducedModelFormulaStr="~1",
                     ):
    # Dividing data into train and validation datasets
    transformed_x = dmatrix(fullModelFormulaStr, data, return_type='dataframe')
    transformed_x_null = dmatrix(reducedModelFormulaStr, data, return_type='dataframe')

    expression = data['expression']
    poisson_training_results = sm.GLM(expression, transformed_x, family=sm.families.Poisson()).fit()
    poisson_df = pd.DataFrame({'mu': poisson_training_results.mu, 'expression': expression})
    poisson_df['AUX_OLS_DEP'] = poisson_df.apply(lambda x: ((x['expression'] - x['mu']) ** 2
                                                            - x['expression']) / x['mu'], axis=1)
    ols_expr = """AUX_OLS_DEP ~ mu - 1"""
    aux_olsr_results = smf.ols(ols_expr, poisson_df).fit()

    nb2_family = sm.families.NegativeBinomial(alpha=aux_olsr_results.params[0])

    try:
        nb2_full = sm.GLM(expression, transformed_x, family=nb2_family).fit()
        nb2_null = sm.GLM(expression, transformed_x_null, family=nb2_family).fit()
    except:
        return ('fail', 'NB2', 1)

    pval = lrt(nb2_full, nb2_null)
    return ('ok', 'NB2', pval)

Source File: patsytransformer.py From scikit-lego with MIT License

5 votes

def fit(self, X, y=None):
        """Fits the estimator"""
        X_ = dmatrix(self.formula, X, NA_action="raise", return_type=self.return_type)

        # check the number of observations hasn't changed. This ought not to
        # be necessary given NA_action='raise' above but just to be safe
        assert np.array(X_).shape[0] == np.array(X).shape[0]
        self.design_info_ = X_.design_info
        return self

Source File: mice.py From Splunking-Crime with GNU Affero General Public License v3.0

5 votes

def _process_kwds(self, kwds, ix):
        kwds = kwds.copy()
        for k in kwds:
            v = kwds[k]
            if isinstance(v, PatsyFormula):
                mat = patsy.dmatrix(v.formula, self.data,
                                    return_type="dataframe")
                mat = np.asarray(mat)[ix, :]
                if mat.shape[1] == 1:
                    mat = mat[:, 0]
                kwds[k] = mat
        return kwds

Source File: test_data.py From vnpy_crypto with MIT License

5 votes

def test_patsy_577():
    X = np.random.random((10, 2))
    df = pandas.DataFrame(X, columns=["var1", "var2"])
    from patsy import dmatrix
    endog = dmatrix("var1 - 1", df)
    np.testing.assert_(data._is_using_patsy(endog, None))
    exog = dmatrix("var2 - 1", df)
    np.testing.assert_(data._is_using_patsy(endog, exog))

Source File: test_mediation.py From vnpy_crypto with MIT License

5 votes

def test_framing_example_moderator():
    # moderation without formulas, generally not useful but test anyway

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(cur_dir, 'results', "framing.csv"))

    outcome = np.asarray(data["cong_mesg"])
    outcome_exog = patsy.dmatrix("emo + treat + age + educ + gender + income", data,
                                  return_type='dataframe')
    probit = sm.families.links.probit
    outcome_model = sm.GLM(outcome, outcome_exog, family=sm.families.Binomial(link=probit()))

    mediator = np.asarray(data["emo"])
    mediator_exog = patsy.dmatrix("treat + age + educ + gender + income", data,
                                 return_type='dataframe')
    mediator_model = sm.OLS(mediator, mediator_exog)

    tx_pos = [outcome_exog.columns.tolist().index("treat"),
              mediator_exog.columns.tolist().index("treat")]
    med_pos = outcome_exog.columns.tolist().index("emo")

    ix = (outcome_exog.columns.tolist().index("age"),
          mediator_exog.columns.tolist().index("age"))
    moderators = {ix : 20}
    med = Mediation(outcome_model, mediator_model, tx_pos, med_pos,
                    moderators=moderators)

    # Just a smoke test
    np.random.seed(4231)
    med_rslt = med.fit(method='parametric', n_rep=100)

Source File: test_mediation.py From vnpy_crypto with MIT License

5 votes

def test_framing_example():

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(cur_dir, 'results', "framing.csv"))

    outcome = np.asarray(data["cong_mesg"])
    outcome_exog = patsy.dmatrix("emo + treat + age + educ + gender + income", data,
                                  return_type='dataframe')
    probit = sm.families.links.probit
    outcome_model = sm.GLM(outcome, outcome_exog, family=sm.families.Binomial(link=probit()))

    mediator = np.asarray(data["emo"])
    mediator_exog = patsy.dmatrix("treat + age + educ + gender + income", data,
                                 return_type='dataframe')
    mediator_model = sm.OLS(mediator, mediator_exog)

    tx_pos = [outcome_exog.columns.tolist().index("treat"),
              mediator_exog.columns.tolist().index("treat")]
    med_pos = outcome_exog.columns.tolist().index("emo")

    med = Mediation(outcome_model, mediator_model, tx_pos, med_pos,
                    outcome_fit_kwargs={'atol':1e-11})

    np.random.seed(4231)
    para_rslt = med.fit(method='parametric', n_rep=100)
    diff = np.asarray(para_rslt.summary() - framing_para_4231)
    assert_allclose(diff, 0, atol=1e-6)

    np.random.seed(4231)
    boot_rslt = med.fit(method='boot', n_rep=100)
    diff = np.asarray(boot_rslt.summary() - framing_boot_4231)
    assert_allclose(diff, 0, atol=1e-6)

Source File: test_phreg.py From vnpy_crypto with MIT License

5 votes

def test_predict_formula(self):

        n = 100
        np.random.seed(34234)
        time = 50 * np.random.uniform(size=n)
        status = np.random.randint(0, 2, n).astype(np.float64)
        exog = np.random.uniform(1, 2, size=(n, 2))

        df = pd.DataFrame({"time": time, "status": status,
                           "exog1": exog[:, 0], "exog2": exog[:, 1]})

        # Works with "0 +" on RHS but issues warning
        fml = "time ~ exog1 + np.log(exog2) + exog1*exog2"
        model1 = PHReg.from_formula(fml, df, status=status)
        result1 = model1.fit()

        from patsy import dmatrix
        dfp = dmatrix(model1.data.design_info.builder, df)

        pr1 = result1.predict()
        pr2 = result1.predict(exog=df)
        pr3 = model1.predict(result1.params, exog=dfp) # No standard errors
        pr4 = model1.predict(result1.params, cov_params=result1.cov_params(), exog=dfp)

        prl = (pr1, pr2, pr3, pr4)
        for i in range(4):
            for j in range(i):
                assert_allclose(prl[i].predicted_values, prl[j].predicted_values)

        prl = (pr1, pr2, pr4)
        for i in range(3):
            for j in range(i):
                assert_allclose(prl[i].standard_errors, prl[j].standard_errors)

Source File: yatsm.py From yatsm with MIT License

5 votes

def setup(self, df, **config):
        """ Setup model for input dataset and (optionally) return design matrix

        Args:
            df (pandas.DataFrame): Pandas dataframe containing dataset
                attributes (e.g., dates, image ID, path/row, metadata, etc.)
            config (dict): YATSM configuration dictionary from user, including
                'dataset' and 'YATSM' sub-configurations

        Returns:
            numpy.ndarray or None: return design matrix if used by algorithm
        """
        X = patsy.dmatrix(config['YATSM']['design_matrix'], data=df)
        return X

Source File: mnl.py From choicemodels with BSD 3-Clause "New" or "Revised" License

4 votes

def fit(self):
        """
        Fit the model using maximum likelihood estimation. Uses either the ChoiceModels
        or PyLogit estimation engine as appropriate.

        Returns
        -------
        MultinomialLogitResults() object.

        """
        if (self._estimation_engine == 'PyLogit'):

            m = pylogit.create_choice_model(data = self._df,
                                            obs_id_col = self._observation_id_col,
                                            alt_id_col = self._alternative_id_col,
                                            choice_col = self._choice_col,
                                            specification = self._model_expression,
                                            names = self._model_labels,
                                            model_type = 'MNL')

            m.fit_mle(init_vals = self._initial_coefs)
            results = MultinomialLogitResults(estimation_engine = self._estimation_engine,
                                              model_expression = self._model_expression,
                                              results = m)

        elif (self._estimation_engine == 'ChoiceModels'):

            dm = dmatrix(self._model_expression, data=self._df)

            chosen = np.reshape(self._df[[self._choice_col]].values,
                                (self._numobs, self._numalts))

            log_lik, fit = mnl_estimate(np.array(dm), chosen, self._numalts)

            result_params = dict(log_likelihood = log_lik,
                                 fit_parameters = fit,
                                 x_names = dm.design_info.column_names)

            results = MultinomialLogitResults(estimation_engine = self._estimation_engine,
                                              model_expression = self._model_expression,
                                              results = result_params)

        return results

Source File: mnl.py From choicemodels with BSD 3-Clause "New" or "Revised" License

4 votes

def probabilities(self, data):
        """
        Generate predicted probabilities for a table of choice scenarios, using the fitted
        parameters stored in the results object.
        
        Parameters
        ----------
        data : choicemodels.tools.MergedChoiceTable
            Long-format table of choice scenarios. TO DO - accept other data formats.
        
        Expected class parameters
        -------------------------
        self.model_expression : patsy string
        self.fitted_parameters : list of floats
        
        Returns
        -------
        pandas.Series with indexes matching the input
        
        """
        # TO DO - make sure this handles pylogit case
        
        # TO DO - does MergedChoiceTable guarantee that alternatives for a single scenario
        # are consecutive? seems like a requirement here; should document it
        
        df = data.to_frame()
        numalts = data.sample_size  # TO DO - make this an official MCT param
        
        dm = dmatrix(self.model_expression, data=df)
        
        # utility is sum of data values times fitted betas
        u = np.dot(self.fitted_parameters, np.transpose(dm))
        
        # reshape so axis 0 lists alternatives and axis 1 lists choosers
        u = np.reshape(u, (numalts, u.size // numalts), order='F')
    
        # scale the utilities to make exponentiation easier
        # https://stats.stackexchange.com/questions/304758/softmax-overflow
        u = u - u.max(axis=0)
        
        exponentiated_utility = np.exp(u)
        sum_exponentiated_utility = np.sum(exponentiated_utility, axis=0)
        
        probs = exponentiated_utility / sum_exponentiated_utility
        
        # convert back to ordering of the input data
        probs = probs.flatten(order='F')
        
        df['prob'] = probs  # adds indexes
        return df.prob

Source File: TMLE.py From zEpid with MIT License

4 votes

def exposure_model(self, model, custom_model=None, bound=False, print_results=True):
        """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
        custom_model : optional
            Input for a custom model that is used in place of the logit model (default). The model must have the
            "fit()" and  "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the
            background, TMLE will fit the custom model and generate the predicted probablities
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            truncating weights leads to additional confounding. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric trunctation, where values below
            or above the threshold are set to the threshold value. Alternatively a list of floats can be provided for
            asymmetric trunctation, with the first value being the lower bound and the second being the upper bound
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)
        """
        self._exp_model = self.exposure + ' ~ ' + model
        self.__mweight = model

        # Step 3) Estimation of g-model (exposure model)
        if custom_model is None:
            fitmodel = propensity_score(self.df, self._exp_model, print_results=print_results)
            self.g1W = fitmodel.predict(self.df)

        # User-specified prediction model
        else:
            # TODO need to create smart warning system
            # warnings.warn("TMLE can result in confidence intervals below nominal coverage when used with "
            #              "certain machine learning algorithms")
            self._exp_model_custom = True
            data = patsy.dmatrix(model + ' - 1', self.df)
            self.g1W = exposure_machine_learner(xdata=np.asarray(data), ydata=np.asarray(self.df[self.exposure]),
                                                ml_model=custom_model, print_results=print_results)

        self.g0W = 1 - self.g1W
        if bound:  # Bounding predicted probabilities if requested
            self.g1W = _bounding_(self.g1W, bounds=bound)
            self.g0W = _bounding_(self.g0W, bounds=bound)

        self._fit_exposure_model = True

Source File: _combat.py From scanpy with BSD 3-Clause "New" or "Revised" License

4 votes

def _design_matrix(
    model: pd.DataFrame, batch_key: str, batch_levels: Collection[str],
) -> pd.DataFrame:
    """\
    Computes a simple design matrix.

    Parameters
    --------
    model
        Contains the batch annotation
    batch_key
        Name of the batch column
    batch_levels
        Levels of the batch annotation

    Returns
    --------
    The design matrix for the regression problem
    """
    import patsy

    design = patsy.dmatrix(
        "~ 0 + C(Q('{}'), levels=batch_levels)".format(batch_key),
        model,
        return_type="dataframe",
    )
    model = model.drop([batch_key], axis=1)
    numerical_covariates = model.select_dtypes('number').columns.values

    logg.info(f"Found {design.shape[1]} batches\n")
    other_cols = [c for c in model.columns.values if c not in numerical_covariates]

    if other_cols:
        col_repr = " + ".join("Q('{}')".format(x) for x in other_cols)
        factor_matrix = patsy.dmatrix(
            "~ 0 + {}".format(col_repr), model[other_cols], return_type="dataframe"
        )

        design = pd.concat((design, factor_matrix), axis=1)
        logg.info(f"Found {len(other_cols)} categorical variables:")
        logg.info("\t" + ", ".join(other_cols) + '\n')

    if numerical_covariates is not None:
        logg.info(f"Found {len(numerical_covariates)} numerical variables:")
        logg.info("\t" + ", ".join(numerical_covariates) + '\n')

        for nC in numerical_covariates:
            design[nC] = model[nC]

    return design

Source File: bayes_mixed_glm.py From vnpy_crypto with MIT License

4 votes

def from_formula(cls, formula, vc_formulas, data, family=None,
                     vcp_p=1, fe_p=2):
        """
        Fit a BayesMixedGLM using a formula.

        Parameters
        ----------
        formula : string
            Formula for the endog and fixed effects terms (use ~ to separate
            dependent and independent expressions).
        vc_formulas : dictionary
            vc_formulas[name] is a one-sided formula that creates one
            collection of random effects with a common variance
            prameter.  If using a categorical expression to produce
            variance components, note that generally `0 + ...` should
            be used so that an intercept is not included.
        data : data frame
            The data to which the formulas are applied.
        family : genmod.families instance
            A GLM family.
        vcp_p : float
            The prior standard deviation for the logarithms of the standard
            deviations of the random effects.
        fe_p : float
            The prior standard deviation for the fixed effects parameters.
        """

        ident = []
        exog_vc = []
        vcp_names = []
        j = 0
        for na, fml in vc_formulas.items():
            mat = patsy.dmatrix(fml, data, return_type='dataframe')
            exog_vc.append(mat)
            vcp_names.append(na)
            ident.append(j * np.ones(mat.shape[1]))
            j += 1
        exog_vc = pd.concat(exog_vc, axis=1)
        vc_names = exog_vc.columns.tolist()

        ident = np.concatenate(ident)

        model = super(_BayesMixedGLM, cls).from_formula(
            formula, data=data, family=family, subset=None,
            exog_vc=exog_vc, ident=ident, vc_names=vc_names,
            vcp_names=vcp_names, fe_p=fe_p, vcp_p=vcp_p)

        return model

Python patsy.dmatrix() Examples