Python Examples of pandas.DataFrame

Source File: glove_preprocessor.py From interpret-text with MIT License

7 votes

def preprocess(self, data) -> pd.DataFrame:
        """Tokenize a list of texts into a dataframe of ids, masks and counts.

        Every text is handed to ``self.generate_tokens``, which must return a
        ``(token_list, mask)`` pair.

        :param data: list of strings (e.g. sentences)
        :type data: list
        :return: dataframe with one row per input text and the columns
            ``tokens`` (token list), ``mask`` (mask list) and ``counts``
            (sum of the mask entries)
        :rtype: pandas.DataFrame
        """
        columns = {"tokens": [], "mask": [], "counts": []}
        for text in data:
            ids, pad_mask = self.generate_tokens(text)
            columns["tokens"].append(ids)
            columns["mask"].append(pad_mask)
            # The sum of the mask is what gets stored as the token count.
            columns["counts"].append(np.sum(pad_mask))
        return pd.DataFrame(columns)

Source File: generic.py From pyiron with BSD 3-Clause "New" or "Revised" License

6 votes

def queue_table(self, project_only=True, recursive=True, full_table=False):
        """
        Display the queuing system table as pandas.DataFrame

        Collects the job ids via ``self.get_job_ids`` and forwards them,
        together with the remaining flags, to the module-level ``queue_table``.

        Args:
            project_only (bool): Query only for jobs within the current project - True by default
            recursive (bool): Include jobs from sub projects
            full_table (bool): Whether to show the entire pandas table

        Returns:
            pandas.DataFrame: whatever the module-level ``queue_table`` returns
        """
        job_ids = self.get_job_ids(recursive=recursive)
        return queue_table(job_ids=job_ids, project_only=project_only, full_table=full_table)

Source File: forecaster.py From prophet with MIT License

6 votes

def add_group_component(self, components, name, group):
        """Adds a component with given name that contains all of the components
        in group.

        Parameters
        ----------
        components: DataFrame with components; must have the columns
            'col' and 'component'.
        name: Name of new group component.
        group: List of components that form the group.

        Returns
        -------
        DataFrame with components. When no row of ``components`` belongs to
        ``group`` the input object is returned unchanged; otherwise a new
        frame with one extra row per distinct 'col' value of the group.
        """
        in_group = components['component'].isin(set(group))
        group_cols = components.loc[in_group, 'col'].unique()
        if len(group_cols) > 0:
            new_comp = pd.DataFrame({'col': group_cols, 'component': name})
            # DataFrame.append was removed in pandas 2.0; pd.concat yields the
            # same rows in the same order and keeps the original index labels.
            components = pd.concat([components, new_comp])
        return components

Source File: glm_reporter.py From nistats with BSD 3-Clause "New" or "Revised" License

6 votes

def _dataframe_to_html(df, precision, **kwargs):
    """ Render a dataframe as an HTML table without the `border` attribute.

    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe to be converted into HTML table.

    precision: int
        Value set for the pandas option 'display.precision' while the
        table is rendered.

    **kwargs: keyworded arguments
        Forwarded unchanged to pandas.DataFrame.to_html()

    Returns
    -------
    html_table: String
        HTML code of the table with every 'border="1" ' occurrence removed.
    """
    # The option is only changed for the duration of the rendering call.
    with pd.option_context('display.precision', precision):
        rendered = df.to_html(**kwargs)
    return rendered.replace('border="1" ', '')

Source File: io.py From code-for-the-world with MIT License

6 votes

def read_selig(path):
    """Read a Selig-style airfoil file

    The first line of the file (the airfoil title) is skipped regardless of
    how many words it contains; every following line is parsed as a
    whitespace-separated pair of coordinates.

    Parameters
    -----------
    path : str
        Path to the Selig-style .dat file.

    Returns
    -------
    air_df : pd.DataFrame
        Pandas DataFrame with the columns 'x' and 'y' holding the
        coordinates of the airfoil data.
    """
    # sep=r'\s+' replaces the deprecated delim_whitespace=True. Skipping the
    # title line instead of parsing it as a header avoids a column-count
    # mismatch when the title does not consist of exactly two tokens.
    return pd.read_csv(path, sep=r'\s+', skiprows=1, header=None,
                       names=['x', 'y'])

Source File: forecaster.py From prophet with MIT License

6 votes

def predict_uncertainty(self, df):
        """Prediction intervals for yhat and trend.

        The interval bounds are the lower and upper percentiles of the
        samples returned by ``self.sample_posterior_predictive``, taken
        along axis 1 and centred so that they span ``self.interval_width``.

        Parameters
        ----------
        df: Prediction dataframe.

        Returns
        -------
        DataFrame with the columns yhat_lower, yhat_upper, trend_lower and
        trend_upper (in that order).
        """
        sim_values = self.sample_posterior_predictive(df)

        width = self.interval_width
        # (column name template, percentile) for each end of the interval.
        bounds = (
            ('{}_lower', 100 * (1.0 - width) / 2),
            ('{}_upper', 100 * (1.0 + width) / 2),
        )

        series = {
            template.format(key): self.percentile(sim_values[key], pct, axis=1)
            for key in ['yhat', 'trend']
            for template, pct in bounds
        }
        return pd.DataFrame(series)

Source File: forecaster.py From prophet with MIT License

6 votes

def predictive_samples(self, df):
        """Sample from the posterior predictive distribution.

        A copy of ``df`` is run through ``self.setup_dataframe`` before
        sampling, so the caller's dataframe object is not passed on directly.

        Parameters
        ----------
        df: DataFrame with dates for predictions (column ds), and capacity
            (column cap) if logistic growth.

        Returns
        -------
        The result of ``self.sample_posterior_predictive`` for the prepared
        dataframe (documented upstream as a dictionary with keys "trend"
        and "yhat").
        """
        prepared = self.setup_dataframe(df.copy())
        return self.sample_posterior_predictive(prepared)

Source File: QADataStruct.py From QUANTAXIS with MIT License

6 votes

def __init__(self, DataFrame):
        """Stock Transaction

        Stores the frame on ``self.data``. When it has no 'amount' column,
        one is derived as volume * price * 100 from the 'vol' column or,
        failing that, the 'volume' column. An '_id' column is dropped.

        Arguments:
            DataFrame {pd.DataFrame} -- [input is one/multi day transaction]
        """

        self.type = 'stock_transaction'
        self.data = DataFrame

        if 'amount' not in DataFrame.columns:
            # 'vol' takes precedence over 'volume'; if neither is present
            # no 'amount' column is created.
            for volume_col in ('vol', 'volume'):
                if volume_col in DataFrame.columns:
                    self.data['amount'] = (
                        getattr(self.data, volume_col) * self.data.price * 100
                    )
                    break

        if '_id' in DataFrame.columns:
            self.data = self.data.drop(["_id"], axis=1)

        self.mongo_coll = DATABASE.stock_transaction

Source File: QADataStruct.py From QUANTAXIS with MIT License

6 votes

def __init__(self, DataFrame):
        """Index Transaction

        Stores the frame on ``self.data``. When it has no 'amount' column,
        one is derived as volume * price * 100 from the 'vol' column or,
        failing that, the 'volume' column. An '_id' column is dropped.

        Arguments:
            DataFrame {pd.DataFrame} -- [input is one/multi day transaction]
        """

        self.type = 'index_transaction'
        self.data = DataFrame

        if 'amount' not in DataFrame.columns:
            # 'vol' takes precedence over 'volume'; if neither is present
            # no 'amount' column is created.
            for volume_col in ('vol', 'volume'):
                if volume_col in DataFrame.columns:
                    self.data['amount'] = (
                        getattr(self.data, volume_col) * self.data.price * 100
                    )
                    break

        if '_id' in DataFrame.columns:
            self.data = self.data.drop(["_id"], axis=1)

        self.mongo_coll = DATABASE.index_transaction

Source File: generic.py From pyiron with BSD 3-Clause "New" or "Revised" License

6 votes

def get_jobs(self, recursive=True, columns=None):
        """
        Internal function to return the jobs as dictionary rather than a pandas.DataFrame

        When ``self.db`` is a ``FileTable`` the query is answered by the table
        itself; otherwise the module-level ``get_jobs`` is called with the
        database connection and the project's query settings.

        Args:
            recursive (bool): search subprojects [True/False]
            columns (list): by default only the columns ['id', 'project'] are selected, but the user can select a subset
                            of ['id', 'status', 'chemicalformula', 'job', 'subjob', 'project', 'projectpath',
                            'timestart', 'timestop', 'totalcputime', 'computer', 'hamilton', 'hamversion', 'parentid',
                            'masterid']

        Returns:
            dict: columns are used as keys and point to a list of the corresponding values
        """
        if isinstance(self.db, FileTable):
            return self.db.get_jobs(project=self.project_path, recursive=recursive, columns=columns)
        return get_jobs(
            database=self.db,
            sql_query=self.sql_query,
            user=self.user,
            project_path=self.project_path,
            recursive=recursive,
            columns=columns,
        )

Source File: moment_est.py From OptimalPortfolio with MIT License

6 votes

def __init__(self, invariants, n, frequency=252):
        """
        Store the sample data and its covariance.

        A RuntimeWarning is issued when ``invariants`` is not a DataFrame;
        the object is still stored and its ``cov()`` method is still called.

        :param invariants: sample data of market invariants
        :type invariants: pd.DataFrame
        :param n: number of assets
        :type n: int
        :param frequency: time horizon of projection
        :type frequency: int
        """
        is_frame = isinstance(invariants, pd.DataFrame)
        if not is_frame:
            warnings.warn("invariants is not pd.Dataframe", RuntimeWarning)
        self.invariants = invariants
        # Covariance is computed once at construction time.
        self.S = self.invariants.cov()
        self.frequency, self.n = frequency, n

Source File: generic.py From pyiron with BSD 3-Clause "New" or "Revised" License

6 votes

def queue_table_global(self, full_table=False):
        """
        Display the queuing system table as pandas.DataFrame

        Queue entries whose job name starts with "pi_" are mapped to database
        items: the "pi_" and ".sh" parts are removed from the name and the
        remainder is used as an integer id for ``self.db.get_item_by_id``.

        Args:
            full_table (bool): Whether to show the entire pandas table

        Returns:
            pandas.DataFrame: one row per matching database item, or None when
            the queue table is empty or no database is configured
        """
        df = queue_table(job_ids=[], project_only=False, full_table=full_table)
        if len(df) == 0 or self.db is None:
            return None

        items = []
        for queue_ID in df["jobname"]:
            label = str(queue_ID)
            # Names without the "pi_" prefix are skipped.
            if not label.startswith("pi_"):
                continue
            job_id = int(label.replace("pi_", "").replace(".sh", ""))
            items.append(self.db.get_item_by_id(job_id))
        return pandas.DataFrame(items)

Source File: hdfio.py From pyiron with BSD 3-Clause "New" or "Revised" License

6 votes

def get_from_table(self, path, name):
        """
        Get a specific value from a table stored under ``path``

        The table returned by ``self.get(path)`` is expected to provide a
        "Parameter" entry supporting ``in`` and ``.index()`` (e.g. a list)
        and a "Value" entry that can be indexed by the resulting position.

        Args:
            path (str): relative path to the data object
            name (str): parameter key

        Returns:
            dict, list, float, int: the value associated to the specific parameter key

        Raises:
            ValueError: if ``name`` is not among the parameters
        """
        table = self.get(path)
        parameters = table["Parameter"]
        if name not in parameters:
            raise ValueError("Unknown name: {0}".format(name))
        position = parameters.index(name)
        return table["Value"][position]

Source File: schemas.py From CityEnergyAnalyst with MIT License

6 votes

def validate(self, df):
        """Check to make sure the DataFrame conforms to the schema.

        Compares the column names of ``df`` with the columns listed under
        ``self.schema["schema"]["columns"]`` and emits a warning (it does not
        raise) listing missing and extra columns when they differ.

        When the schema lists "PIPE0" (or "NODE0"), every column of ``df``
        whose name starts with "PIPE" (or "NODE") is collapsed into that
        single placeholder before comparing, so any number of such columns -
        including none - is accepted.

        :param df: the dataframe to check; column names must be strings
        :return: None
        """
        expected_columns = set(self.schema["schema"]["columns"].keys())
        found_columns = set(df.columns.values)

        # handle some extra cases: numbered column families are represented
        # in the schema by their "0" member only
        for prefix in ("PIPE", "NODE"):
            placeholder = prefix + "0"
            if placeholder in expected_columns:
                found_columns = {c for c in found_columns if not c.startswith(prefix)}
                found_columns.add(placeholder)

        if found_columns != expected_columns:
            missing_columns = expected_columns - found_columns
            extra_columns = found_columns - expected_columns

            # The message now separates the locator from the details and
            # closes the parenthesis it opens.
            warnings.warn(
                "Dataframe does not conform to schemas.yml specification for {lm} "
                "(missing: {missing_columns}, extra: {extra_columns})".format(
                    lm=self.lm, missing_columns=missing_columns, extra_columns=extra_columns))

Source File: moment_est.py From OptimalPortfolio with MIT License

6 votes

def __init__(self, invariants, n, dist="normal"):
        """
        Store the inputs and initialise all moment attributes to None.

        :param invariants: sample data of market invariants
        :type invariants: pd.DataFrame
        :param n: number of assets
        :type n: int
        :param dist: choice of distribution: "normal"
        :type dist: str
        """
        self.invariants, self.dist, self.n = invariants, dist, n
        # No moment is computed here; presumably they are filled in later.
        self.mean = self.cov = self.skew = self.kurt = None

Source File: moment_est.py From OptimalPortfolio with MIT License

6 votes

def exp_cov(invariants, span=180, frequency=252):
    """
    Calculates sample exponentially weighted covariance

    Non-DataFrame input triggers a RuntimeWarning and is converted with
    ``pd.DataFrame``. The result is the last (assets x assets) block of the
    exponentially weighted covariance, multiplied by ``frequency``.

    :param invariants: sample data of market invariants
    :type invariants: pd.DataFrame
    :param frequency: time horizon of projection
    :type frequency: int
    :param span: the span for exponential weights
    :return: sample exponentially weighted covariance dataframe
    """
    if not isinstance(invariants, pd.DataFrame):
        warnings.warn("invariants not a pd.Dataframe", RuntimeWarning)
        invariants = pd.DataFrame(invariants)
    n_assets = len(invariants.columns)
    ewm_cov = invariants.ewm(span=span).cov()
    # Only the trailing n_assets rows and columns are kept.
    latest_block = ewm_cov.iloc[-n_assets:, -n_assets:]
    return pd.DataFrame(latest_block * frequency)

Source File: moment_est.py From OptimalPortfolio with MIT License

6 votes

def sample_moment(invariants, order, frequency=252):
    """
    Calculates nth moment of sample data.

    Non-DataFrame input triggers a RuntimeWarning and is converted with
    ``pd.DataFrame``. The moment is computed by ``moment`` and then
    multiplied by ``frequency``.

    :param invariants: sample data of market invariants
    :type invariants: pd.DataFrame
    :param order: order of moment
    :type order: int
    :param frequency: time horizon of projection
    :type frequency: int
    :return: nth moment of sample invariants
    """
    frame = invariants
    if not isinstance(frame, pd.DataFrame):
        warnings.warn("invariants not a pd.Dataframe", RuntimeWarning)
        frame = pd.DataFrame(frame)
    return moment(frame, moment=order) * frequency

Source File: moment_est.py From OptimalPortfolio with MIT License

6 votes

def sample_coM4(invariants):
    """
    Calculates sample fourth order co-moment matrix
    Taps into the R package PerformanceAnalytics through rpy2

    Non-DataFrame input triggers a RuntimeWarning and is converted with
    ``pd.DataFrame``.

    :param invariants: sample data of market invariants, one row per
        observation and one column per asset
    :type invariants: pd.DataFrame
    :return: the result of R's ``M4.MM`` for the sample, wrapped in np.matrix
    :rtype: np.matrix
    """

    importr('PerformanceAnalytics')
    if not isinstance(invariants, pd.DataFrame):
        warnings.warn("invariants not a pd.Dataframe", RuntimeWarning)
        invariants = pd.DataFrame(invariants)
    n_obs, n_assets = invariants.shape
    cokurt_function = robjects.r('M4.MM')
    # Flatten row by row, then tell R to fill by row as well, so the R
    # matrix has the same (observations x assets) layout as the dataframe.
    # Building it as (assets x assets) would silently discard every value
    # after the first n_assets**2 and transpose the rest.
    r_inv_vec = robjects.FloatVector(invariants.values.ravel())
    r_invariants = robjects.r.matrix(
        r_inv_vec, nrow=n_obs, ncol=n_assets, byrow=True
    )
    r_M4 = cokurt_function(r_invariants)

    return np.matrix(r_M4)

Source File: moment_est.py From OptimalPortfolio with MIT License

6 votes

def sample_coM3(invariants):
    """
    Calculates sample third order co-moment matrix
    Taps into the R package PerformanceAnalytics through rpy2

    Non-DataFrame input triggers a RuntimeWarning and is converted with
    ``pd.DataFrame``.

    :param invariants: sample data of market invariants, one row per
        observation and one column per asset
    :type invariants: pd.DataFrame
    :return: the result of R's ``M3.MM`` for the sample, wrapped in np.matrix
    :rtype: np.matrix
    """

    importr('PerformanceAnalytics')
    if not isinstance(invariants, pd.DataFrame):
        warnings.warn("invariants not a pd.Dataframe", RuntimeWarning)
        invariants = pd.DataFrame(invariants)
    n_obs, n_assets = invariants.shape
    coskew_function = robjects.r('M3.MM')
    # Flatten row by row, then tell R to fill by row as well, so the R
    # matrix has the same (observations x assets) layout as the dataframe.
    # Building it as (assets x assets) would silently discard every value
    # after the first n_assets**2 and transpose the rest.
    r_inv_vec = robjects.FloatVector(invariants.values.ravel())
    r_invariants = robjects.r.matrix(
        r_inv_vec, nrow=n_obs, ncol=n_assets, byrow=True
    )
    r_M3 = coskew_function(r_invariants)

    return np.matrix(r_M3)

Source File: invariants.py From OptimalPortfolio with MIT License

6 votes

def forex_invariants(prices, no_assets):
    """
    Calculates forex price invariants, which are the compounded returns

    For each of the first ``no_assets`` columns the log of the ratio between
    consecutive rows is computed. A non-DataFrame input only triggers a
    RuntimeWarning; it is not converted.

    NOTE(review): each asset's column is prepended to the result and the
    column labels are renumbered, so result column 0 belongs to the last
    asset processed and the original column names are not kept.

    :param prices: price data of the various tickers
    :type prices: pd.DataFrame
    :param no_assets: number of assets in data
    :type no_assets: int
    :return: dataframe of invariants with len(prices) - 1 rows
    :rtype: pd.DataFrame
    """
    if not isinstance(prices, pd.DataFrame):
        warnings.warn("prices are not a pd Dataframe", RuntimeWarning)

    n_rows = len(prices)
    asset_ret = pd.DataFrame()
    for asset in range(no_assets):
        log_returns = [
            np.log(prices.iloc[row, asset] / prices.iloc[row - 1, asset])
            for row in range(1, n_rows)
        ]
        newest = pd.DataFrame(log_returns)
        asset_ret = pd.concat([newest, asset_ret], axis=1, ignore_index=True)
    return asset_ret

Source File: invariants.py From OptimalPortfolio with MIT License

6 votes

def stock_invariants(prices, no_assets):
    """
    Calculates stock price invariants, which are the compounded returns

    For each of the first ``no_assets`` columns the log of the ratio between
    consecutive rows is computed. A non-DataFrame input only triggers a
    RuntimeWarning; it is not converted.

    NOTE(review): each asset's column is prepended to the result and the
    column labels are renumbered, so result column 0 belongs to the last
    asset processed and the original column names are not kept.

    :param prices: stock prices data of the various tickers
    :type prices: pd.DataFrame
    :param no_assets: number of assets in data
    :type no_assets: int
    :return: dataframe of stock invariants with len(prices) - 1 rows
    :rtype: pd.DataFrame
    """
    if not isinstance(prices, pd.DataFrame):
        warnings.warn("prices are not a pd Dataframe", RuntimeWarning)

    n_rows = len(prices)
    asset_ret = pd.DataFrame()
    for asset in range(no_assets):
        log_returns = [
            np.log(prices.iloc[row, asset] / prices.iloc[row - 1, asset])
            for row in range(1, n_rows)
        ]
        newest = pd.DataFrame(log_returns)
        asset_ret = pd.concat([newest, asset_ret], axis=1, ignore_index=True)
    return asset_ret

Source File: pdutils.py From pysystemtrade with GNU General Public License v3.0

6 votes

def dataframe_pad(starting_df, column_list, padwith=0.0):
    """
    Takes a dataframe and adds extra columns if necessary so we end up with columns named column_list

    Columns of ``starting_df`` that are not in ``column_list`` are dropped;
    columns in ``column_list`` that are missing from ``starting_df`` are
    created and filled with ``padwith``.

    :param starting_df: A pd.DataFrame with named columns
    :param column_list: A list of column names
    :param padwith: The value to pad missing columns with
    :return: pd.DataFrame with exactly the columns in column_list, in that
        order, sharing the index of starting_df
    """

    def _pad_column(column_name, starting_df, padwith):
        if column_name in starting_df.columns:
            return starting_df[column_name]
        # Use the requested pad value (previously a hard-coded 0.0 was used
        # and the padwith argument was silently ignored).
        return pd.Series([padwith] * len(starting_df.index), starting_df.index)

    new_data = [
        _pad_column(column_name, starting_df, padwith)
        for column_name in column_list
    ]

    new_df = pd.concat(new_data, axis=1)
    new_df.columns = column_list

    return new_df

Source File: logistic_regressor.py From autoimpute with MIT License

6 votes

def predict_proba(self, X):
        """Predict probabilities of class membership for logistic regression.

        The input is first passed through ``self._predict_strategy_validator``.
        The first entry of ``self.statistics_["coefs"]`` is used as the
        intercept and the remaining entries as the coefficients of the
        predictors; ``self._sigmoid`` is applied to the resulting linear
        combination.

        Args:
            X (pd.DataFrame): predictors to predict response

        Returns:
            np.array: prob of class membership for predicted observations.
        """

        # run validation first
        X = self._predict_strategy_validator(self, X)

        # split the coefficients into intercept and slopes
        coefs = self.statistics_["coefs"].values
        intercept, slopes = coefs[0], coefs[1:]
        linear_part = intercept + np.dot(X, slopes)
        return self._sigmoid(linear_part)

Source File: parallel.py From pyiron with BSD 3-Clause "New" or "Revised" License

6 votes

def output_to_pandas(self, sort_by=None, h5_path="output"):
        """
        Convert the content stored under ``h5_path`` to a pandas DataFrame object.

        Every node listed under ``h5_path`` is copied into ``self._output``
        (existing keys are overwritten), and the dataframe is built from
        that dictionary.

        Args:
            sort_by (str): sort the output using pandas.DataFrame.sort_values(by=sort_by)
            h5_path (str): select child output to include - default='output'

        Returns:
            pandas.DataFrame: output as dataframe
        """
        # TODO: The output to pandas function should no longer be required
        with self.project_hdf5.open(h5_path) as hdf:
            for node in hdf.list_nodes():
                self._output[node] = hdf[node]
        frame = pandas.DataFrame(self._output)
        if sort_by is None:
            return frame
        return frame.sort_values(by=sort_by)

    # TODO: make it more general and move it then into genericJob

Source File: exchange_algorithm.py From catalyst with Apache License 2.0

5 votes

def get_frame_stats(self):
        """
        preparing the stats before analyze

        Combines the stats pickled in the algo's 'frame_stats' folder with
        the in-memory ``self.frame_stats``; both parts are indexed by
        'period_close'. Without that folder only the in-memory stats are
        returned.

        NOTE(review): the files are read with pickle.load, which must only
        be used on trusted data.

        :return: stats: pd.DataFrame
        """
        # add the last day stats which is not saved in the directory
        current_stats = pd.DataFrame(self.frame_stats)
        current_stats.set_index('period_close', drop=False, inplace=True)

        # get the location of the directory
        folder = join(get_algo_folder(self.algo_namespace), 'frame_stats')
        if not exists(folder):
            return current_stats

        stat_files = [
            join(folder, entry)
            for entry in listdir(folder)
            if isfile(join(folder, entry))
        ]

        period_stats_list = []
        for filename in stat_files:
            with open(filename, 'rb') as handle:
                period_stats_list.extend(pickle.load(handle))

        stats = pd.DataFrame(period_stats_list)
        stats.set_index('period_close', drop=False, inplace=True)

        # Stored periods come first, the current stats last.
        return pd.concat([stats, current_stats])

Source File: accounting.py From pysystemtrade with GNU General Public License v3.0

5 votes

def to_frame(self, curve_type="net"):
        """
        Returns individual return curves as a data frame

        The attribute of ``self`` named by ``curve_type`` is looked up and
        its own ``to_frame()`` is returned.

        :param curve_type: gross, net or costs
        :type curve_type: str

        :returns: pd.DataFrame TxN
        """
        return getattr(self, curve_type).to_frame()

Source File: split_data.py From vae-anomaly-detector with MIT License

5 votes

def split_data(dataframe, split):
    """
    Split the data into a training set and a test set.

    The positional row indices come from ``_split_indices``; only its
    'train' and 'test' entries are used here.

    Args:
        dataframe: (pandas.DataFrame)
        split: (list of float) train/valid/test split
    Returns:
        tuple: (train_data, test_data), both selected with ``iloc``
    """
    indices = _split_indices(dataframe, split)
    return tuple(dataframe.iloc[indices[part]] for part in ('train', 'test'))

Source File: agg.py From quantipy with MIT License

5 votes

def make_default_cat_view(link, weights=None):
    '''
    Build the default categorical aggregation view for a link.

    Runs the link through get_matrix(), weight_matrix() and
    _default_cat_df(), then applies struct.set_qp_multiindex() using the
    link's x and y names.

    Parameters
    ----------
    link : object
        Link definition; its ``x`` and ``y`` attributes are used for the
        multiindex. It is also passed to get_matrix().

    weights : optional, default None
        Passed through to get_matrix().

    Returns
    -------
    view_df : pd.DataFrame (multiindexed)
    '''
    matrix, xdef, ydef = get_matrix(link, weights)
    weighted = weight_matrix(matrix, xdef)
    cat_df = _default_cat_df(weighted, xdef, ydef)
    return struct.set_qp_multiindex(cat_df, link.x, link.y)

Source File: struct.py From quantipy with MIT License

5 votes

def deep_drop(df, targets, axes=(0, 1)):
    '''
    Drops all labels given in the targets list from the defined
    axes of the passed dataframe. The dataframe is allowed to be
    multiindexed on both axes.

    Only every second index level, starting with level 1 (i.e. levels
    1, 3, 5, ...), is searched for the targets. An axis with a flat
    (single-level) index therefore has nothing dropped from it.

    Parameters
    ---------
    df : pd.DataFrame

    targets : string or sequence of strings
        Labels to be dropped.

    axes : int or sequence of int, default = (0, 1)
        Specification of the axes to drop from.
        Will perform the drop on both axes by default.

    Returns
    -------
    df : pd.DataFrame
    '''
    if not isinstance(targets, (list, tuple)):
        targets = [targets]

    if not isinstance(axes, (list, tuple)):
        axes = [axes]

    # nlevels exists on every Index (it is 1 for a flat one), whereas
    # .levels is only defined on a MultiIndex and would raise here even
    # for an axis that was not requested.
    levels = (df.index.nlevels, df.columns.nlevels)

    for axis in axes:
        for level in range(1, levels[axis], 2):
            for target in targets:
                df = df.drop(target, axis=axis, level=level)

    return df

Source File: clean.py From cfanalytics with BSD 3-Clause "New" or "Revised" License

5 votes

def _overall_percentile(self):
        """Add an overall percentile column.

        Evenly spaced percentiles from 100 down to 0 (rounded to 4 decimals,
        one per row of ``self.df``) are passed together with the
        'Overall_rank' column through ``self._rm_dups`` and written into the
        column at position 10 of ``self.cleandata``.

        Returns
        -------
        self
            The instance itself, with ``self.cleandata`` updated in place.
        """
        ranks = self.df['Overall_rank']
        ascending = np.round(np.linspace(0, 100, num=len(self.df)),
                             decimals=4)
        # Check for duplications
        pct = self._rm_dups(ranks, np.flip(ascending, 0))
        self.cleandata.iloc[:, 10] = pct
        return self

Python pandas.DataFrame() Examples