Python Examples of pandas.DataFrames

Source File: checks.py From bulwark with GNU Lesser General Public License v3.0

6 votes

def is_same_as(df, df_to_compare, **kwargs):
    """Check that ``df`` equals ``df_to_compare`` and hand ``df`` back.

    The comparison itself is delegated to ``tm.assert_frame_equal``. When
    that raises an ``AssertionError`` it is re-raised with a generic
    message, keeping the detailed error attached as the cause.

    Args:
        df (pd.DataFrame): The frame under test.
        df_to_compare (pd.DataFrame): The frame ``df`` is compared against.
        **kwargs (dict): Forwarded unchanged to ``tm.assert_frame_equal``.

    Returns:
        The ``df`` argument itself (the same object that was passed in).

    Raises:
        AssertionError: If ``tm.assert_frame_equal`` reports a difference.

    """
    try:
        tm.assert_frame_equal(df, df_to_compare, **kwargs)
    except AssertionError as mismatch:
        # Keep the detailed diff reachable through ``__cause__``.
        raise AssertionError("DataFrames are not equal") from mismatch
    else:
        return df

Source File: correlations.py From pysystemtrade with GNU General Public License v3.0

6 votes

def __init__(self, corr_list, column_names, fit_dates):
        """
        Store the three constructor arguments on the instance

        :param corr_list: stored as ``self.corr_list``
        :type corr_list: presumably a list of correlation estimates, one per
                         fit period - TODO confirm against the caller

        :param column_names: stored as ``self.columns`` (note the attribute
                             name differs from the argument name)
        :type column_names: presumably a list of str - TODO confirm

        :param fit_dates: stored as ``self.fit_dates``
        :type fit_dates: presumably the fitting periods matching corr_list -
                         TODO confirm

        """

        # Plain attribute assignment; equivalent to setattr with a constant name
        self.corr_list = corr_list
        self.columns = column_names
        self.fit_dates = fit_dates

Source File: accounts_inputs.py From pysystemtrade with GNU General Public License v3.0

6 votes

def get_capped_forecast(self, instrument_code, rule_variation_name):
        """
        Fetch a capped forecast by delegating to the forecastScaleCap stage
        of the parent system

        KEY INPUT

        :param instrument_code: passed straight through to the stage
        :type str:

        :param rule_variation_name: name of the trading rule variation
        :type str:

        :returns: whatever ``forecastScaleCap.get_capped_forecast`` returns
                  (documented upstream as a Tx1 pd.DataFrame)

        """
        scale_cap_stage = self.parent.forecastScaleCap
        capped_forecast = scale_cap_stage.get_capped_forecast(
            instrument_code, rule_variation_name)
        return capped_forecast

Source File: accounts_inputs.py From pysystemtrade with GNU General Public License v3.0

6 votes

def get_forecast_weights(self, instrument_code):
        """
        Get the forecast weights from the combForecast stage of the parent
        system

        KEY INPUT

        :param instrument_code: passed straight through to the stage
        :type str:

        :returns: whatever ``combForecast.get_forecast_weights`` returns
                  (the previous docs described a dict of Tx1 pd.DataFrames -
                  TODO confirm against the combForecast stage)

        """
        comb_forecast_stage = self.parent.combForecast
        return comb_forecast_stage.get_forecast_weights(instrument_code)

Source File: accounts_inputs.py From pysystemtrade with GNU General Public License v3.0

6 votes

def get_daily_returns_volatility(self, instrument_code):
        """
        Daily return (not %) volatility for an instrument

        Uses the parent system's ``rawdata`` stage when it has one; otherwise
        the volatility is calculated here from the daily price differences.

        KEY INPUT

        :param instrument_code:
        :type str:

        :returns: result of ``rawdata.daily_returns_volatility`` or of
                  ``robust_vol_calc`` (documented upstream as Tx1 pd.DataFrame)

        """

        owner = self.parent

        if not hasattr(owner, "rawdata"):
            # No rawdata stage: fall back to our own calculation on price changes
            daily_price = self.get_daily_price(instrument_code)
            return robust_vol_calc(daily_price.diff())

        return owner.rawdata.daily_returns_volatility(instrument_code)

Source File: accounts_inputs.py From pysystemtrade with GNU General Public License v3.0

6 votes

def get_aligned_forecast(self, instrument_code, rule_variation_name):
        """
        Capped forecast re-indexed onto the daily price index

        Dates that exist in the price index but not in the forecast are
        filled forwards from the last available forecast value.

        KEY INPUT

        :param instrument_code:
        :type str:

        :param rule_variation_name: name of the trading rule variation
        :type str:

        :returns: the capped forecast, reindexed to the daily price index and
                  forward filled

        """
        daily_price = self.get_daily_price(instrument_code)
        capped = self.get_capped_forecast(instrument_code,
                                          rule_variation_name)

        return capped.reindex(daily_price.index).ffill()

Source File: accounts_inputs.py From pysystemtrade with GNU General Public License v3.0

5 votes

def get_forecast_diversification_multiplier(self, instrument_code):
        """
        Fetch the forecast diversification multiplier (f.d.m) by delegating
        to the combForecast stage of the parent system

        KEY INPUT

        :param instrument_code: passed straight through to the stage
        :type str:

        :returns: whatever the combForecast stage returns for this instrument

        """
        comb_forecast_stage = self.parent.combForecast
        multiplier = comb_forecast_stage.get_forecast_diversification_multiplier(
            instrument_code)
        return multiplier

Source File: conftest.py From kartothek with MIT License

5 votes

def meta_partitions_evaluation_dataframe(metadata_version):
    """
    Build four MetaPartitions for testing.

    Each partition is labelled ``cluster_<P>_<HORIZON>`` and carries a single
    one-row in-memory pd.DataFrame under the ``"PRED"`` key, with columns
    ``P``, ``L``, ``HORIZON`` and ``PRED`` (in that order).

    """
    # (label, P, L, HORIZON, PRED) for each partition, in the order returned
    partition_specs = [
        ("cluster_1_1", 1, 1, 1, 10),
        ("cluster_1_2", 1, 1, 2, 20),
        ("cluster_2_1", 2, 2, 1, 10),
        ("cluster_2_2", 2, 2, 2, 20),
    ]

    partitions = []
    for label, p_value, l_value, horizon, pred in partition_specs:
        frame = pd.DataFrame(
            OrderedDict(
                [
                    ("P", [p_value]),
                    ("L", [l_value]),
                    ("HORIZON", [horizon]),
                    ("PRED", [pred]),
                ]
            )
        )
        partitions.append(
            MetaPartition(
                label=label, data={"PRED": frame}, metadata_version=metadata_version
            )
        )
    return partitions

Source File: conftest.py From kartothek with MIT License

5 votes

def meta_partitions_dataframe_function(metadata_version):
    """
    Return the list of test MetaPartitions produced by
    ``_get_meta_partitions_with_dataframe`` for ``metadata_version``.

    Per the original description, the partitions hold in-memory
    pd.DataFrames without external references (i.e. files are empty).

    """
    partitions = _get_meta_partitions_with_dataframe(metadata_version)
    return partitions

Source File: conftest.py From kartothek with MIT License

5 votes

def meta_partitions_dataframe(metadata_version):
    """
    Return the list of test MetaPartitions produced by
    ``_get_meta_partitions_with_dataframe``, built while the
    ``cm_frozen_time(TIME_TO_FREEZE)`` context manager is active.

    Per the original description, the partitions hold in-memory
    pd.DataFrames without external references (i.e. files are empty).

    """
    frozen_clock = cm_frozen_time(TIME_TO_FREEZE)
    with frozen_clock:
        partitions = _get_meta_partitions_with_dataframe(metadata_version)
        return partitions

Source File: theta.py From sktime with BSD 3-Clause "New" or "Revised" License

5 votes

def compute_pred_int(self, y_pred, alpha=DEFAULT_ALPHA):
        """
        Build prediction intervals around ``y_pred``.

        The errors come from ``self._compute_pred_errors(alpha=alpha)``; each
        interval is a pd.DataFrame with columns ``lower`` (``y_pred - error``)
        and ``upper`` (``y_pred + error``).

        Returns a single pd.DataFrame when exactly one interval results,
        otherwise a list of pd.DataFrames (one per error entry).
        """
        raw_errors = self._compute_pred_errors(alpha=alpha)

        # A lone pd.Series (single alpha) is wrapped so that both cases
        # share the same loop below
        error_items = [raw_errors] if isinstance(raw_errors, pd.Series) else raw_errors

        intervals = []
        for err in error_items:
            bounds = {"lower": y_pred - err, "upper": y_pred + err}
            intervals.append(pd.DataFrame(bounds))

        # Unwrap when there is only one interval
        return intervals[0] if len(intervals) == 1 else intervals

Source File: forecast_combine.py From pysystemtrade with GNU General Public License v3.0

5 votes

def get_all_forecasts(self, instrument_code, rule_variation_list=None):
        """
        Forecasts for one instrument across a list of rule variations

        KEY INPUT

        :param instrument_code:
        :type str:

        :param rule_variation_list: rule variations to get forecasts for; when
                                    None, self.get_trading_rule_list(instrument_code)
                                    supplies the list
        :type list: list of str

        :returns: result of self.get_forecasts_given_rule_list (TxN
                  pd.DataFrame; columns rule_variation_name)

        >>> from systems.tests.testdata import get_test_object_futures_with_rules_and_capping
        >>> from systems.basesystem import System
        >>> (fcs, rules, rawdata, data, config)=get_test_object_futures_with_rules_and_capping()
        >>> system1=System([rawdata, rules, fcs, ForecastCombineFixed()], data, config)
        >>> system1.combForecast.get_all_forecasts("EDOLLAR",["ewmac8"]).tail(2)
                      ewmac8
        2015-12-10 -0.190583
        2015-12-11  0.871231
        >>>
        >>> system2=System([rawdata, rules, fcs, ForecastCombineFixed()], data, config)
        >>> system2.combForecast.get_all_forecasts("EDOLLAR").tail(2)
                     ewmac16    ewmac8
        2015-12-10  3.134462 -0.190583
        2015-12-11  3.606243  0.871231
        """

        rules_to_use = rule_variation_list
        if rules_to_use is None:
            # Only None triggers the lookup; an empty list is passed on as is
            rules_to_use = self.get_trading_rule_list(instrument_code)

        return self.get_forecasts_given_rule_list(instrument_code, rules_to_use)

Source File: forecast_combine.py From pysystemtrade with GNU General Public License v3.0

5 votes

def get_capped_forecast(self, instrument_code, rule_variation_name):
        """
        Fetch a capped forecast by delegating to the forecastScaleCap stage
        of the parent system

        KEY INPUT

        :param instrument_code:
        :type str:

        :param rule_variation_name: name of the trading rule variation
        :type str:

        :returns: whatever ``forecastScaleCap.get_capped_forecast`` returns;
                  the example below shows a single-column pd.DataFrame named
                  after the rule variation

        >>> from systems.tests.testdata import get_test_object_futures_with_rules_and_capping
        >>> from systems.basesystem import System
        >>> (fcs, rules, rawdata, data, config)=get_test_object_futures_with_rules_and_capping()
        >>> system=System([rawdata, rules, fcs, ForecastCombineFixed()], data, config)
        >>> system.combForecast.get_capped_forecast("EDOLLAR","ewmac8").tail(2)
                      ewmac8
        2015-12-10 -0.190583
        2015-12-11  0.871231
        """

        scale_cap_stage = self.parent.forecastScaleCap
        capped_forecast = scale_cap_stage.get_capped_forecast(
            instrument_code, rule_variation_name)
        return capped_forecast

Source File: accounts_inputs.py From pysystemtrade with GNU General Public License v3.0

5 votes

def get_daily_price(self, instrument_code):
        """
        Daily price for an instrument, read from the parent system's ``data``
        object via ``daily_prices``

        NOTE(review): the earlier docs mentioned rawdata and caching; neither
        is visible in this method - confirm whether a caching decorator
        applies at the definition site.

        :param instrument_code:
        :type str:

        :returns: whatever ``data.daily_prices`` returns (documented upstream
                  as Tx1 pd.DataFrame)

        """
        data_source = self.parent.data
        return data_source.daily_prices(instrument_code)

Source File: struct.py From quantipy with MIT License

5 votes

def set_qp_multiindex(df, x, y):
    '''
    Apply Quantipy's Question/Values layout to a pd.DataFrame by putting
    a two-level MultiIndex on both axes. The frame is modified in place.

    Parameters
    ----------
    df : pd.DataFrame

    x, y : str
        Variable names from the processed case data input,
        i.e. the link definition. ``x`` becomes the outer index level.
        ``y`` becomes the outer column level, except when it is ``None``
        or ``'@'``, in which case ``x`` is used for the columns as well.

    Returns
    -------
    df : pd.DataFrame (Quantipy convention, multiindexed; same object
        that was passed in)
    '''
    level_names = ['Question', 'Values']

    df.index = pd.MultiIndex.from_product([[x], df.index], names=level_names)

    # None and '@' are treated alike here: the columns are keyed by x
    outer_column_key = x if (y is None or y == '@') else y
    df.columns = pd.MultiIndex.from_product(
        [[outer_column_key], df.columns], names=level_names)

    return df

Source File: functions.py From quantipy with MIT License

5 votes

def set_qp_multiindex(df, x, y):
    '''
    Apply Quantipy's Question/Values layout to a pd.DataFrame by putting
    a two-level MultiIndex on both axes. The frame is modified in place.

    Parameters
    ----------
    df : pd.DataFrame

    x, y : str
        Variable names from the processed case data input,
        i.e. the link definition. ``x`` becomes the outer index level.
        For the columns:

        - ``y is None``: outer level ``x``, inner level the existing columns.
        - ``y == '@'``: the columns become the single pair ``(x, '@')``, so
          ``df`` must have exactly one column.
        - otherwise: outer level ``y``, inner level the existing columns.

    Returns
    -------
    df : pd.DataFrame (Quantipy convention, multiindexed; same object
        that was passed in)
    '''
    axis_labels = ['Question', 'Values']
    df.index = pd.MultiIndex.from_product([[x], df.index], names=axis_labels)
    if y is None:
        df.columns = pd.MultiIndex.from_product([[x], df.columns], names=axis_labels)
    elif y == '@':
        # Each level must be list-like: a bare '@' string is rejected by
        # current pandas ("Input must be list-like"), so wrap it in a list.
        df.columns = pd.MultiIndex.from_product([[x], ['@']], names=axis_labels)
    else:
        df.columns = pd.MultiIndex.from_product([[y], df.columns], names=axis_labels)

    return df

Source File: functions.py From quantipy with MIT License

5 votes

def apply_viewdf_layout(df, x, y):
    '''
    Apply Quantipy's Question/Values layout to a pd.DataFrame by putting
    a two-level MultiIndex on both axes. The frame is modified in place.

    Parameters
    ----------
    df : pd.DataFrame

    x, y : str
        Variable names from the processed case data input,
        i.e. the link definition. ``x`` becomes the outer index level.
        For the columns:

        - ``y is None``: outer level ``x``, inner level the existing columns.
        - ``y == '@'``: the columns become the single pair ``(x, '@')``, so
          ``df`` must have exactly one column.
        - otherwise: outer level ``y``, inner level the existing columns.

    Returns
    -------
    df : pd.DataFrame (multiindexed; same object that was passed in)
    '''
    axis_labels = ['Question', 'Values']
    df.index = pd.MultiIndex.from_product([[x], df.index], names=axis_labels)
    if y is None:
        df.columns = pd.MultiIndex.from_product([[x], df.columns], names=axis_labels)
    elif y == '@':
        # Each level must be list-like: a bare '@' string is rejected by
        # current pandas ("Input must be list-like"), so wrap it in a list.
        df.columns = pd.MultiIndex.from_product([[x], ['@']], names=axis_labels)
    else:
        df.columns = pd.MultiIndex.from_product([[y], df.columns], names=axis_labels)

    return df

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Source File: load_data.py From CrypTen with MIT License

5 votes

def read_data(data_dir, dates):
    """Builds dataframes for model and func benchmarks. Assumes directory is structured as
     DATA_PATH
        |_2020-02-20
            |_func_benchmarks.csv
            |_model_benchmarks.csv

    Args:
        data_dir (pathlib.path): path containing one subdirectory per date
        dates (list of str): names of the date subdirectories to read

    Returns: tuple ``(func_df, model_df)`` of pd.DataFrames. Each holds the rows
        of every date's CSV with an added ``date`` column; ``func_df`` is
        additionally passed through ``compute_runtime_gap`` and ``add_error_bars``.
    """
    func_frames, model_frames = [], []

    for date in dates:
        path = os.path.join(data_dir, date)
        tmp_func_df = pd.read_csv(os.path.join(path, "func_benchmarks.csv"))
        tmp_model_df = pd.read_csv(os.path.join(path, "model_benchmarks.csv"))
        tmp_func_df["date"], tmp_model_df["date"] = date, date
        func_frames.append(tmp_func_df)
        model_frames.append(tmp_model_df)

    # DataFrame.append was removed in pandas 2.0. A single concat keeps the
    # same row order and original row labels, and avoids re-copying the
    # accumulated frame on every iteration. With no dates the frames stay
    # None, exactly as before.
    func_df = pd.concat(func_frames) if func_frames else None
    model_df = pd.concat(model_frames) if model_frames else None

    func_df = compute_runtime_gap(func_df)
    func_df = add_error_bars(func_df)
    return func_df, model_df

Source File: transform_problem.py From estimagic with BSD 3-Clause "New" or "Revised" License

5 votes

def _check_params(params):
    """Check params has a unique index and contains no columns to be created internally.

    Args:
        params (pd.DataFrame or list of pd.DataFrames): See :ref:`params`.

    Raises:
        AssertionError: The index contains duplicates.
        ValueError: The DataFrame contains internal columns, i.e. a column
            named in ``invalid_names`` or a string column name starting
            with ``"_internal"``.

    """
    # Raised explicitly instead of via ``assert`` so the check is not
    # stripped when Python runs with -O; exception type and message are
    # unchanged.
    if params.index.duplicated().any():
        raise AssertionError("No duplicates allowed in the index of params.")

    invalid_names = [
        "_fixed",
        "_fixed_value",
        "_is_fixed_to_value",
        "_is_fixed_to_other",
    ]
    # Column labels need not be strings (e.g. integers); only string labels
    # can carry the reserved "_internal" prefix.
    invalid_present_columns = [
        col
        for col in params.columns
        if col in invalid_names
        or (isinstance(col, str) and col.startswith("_internal"))
    ]

    if invalid_present_columns:
        msg = (
            "Column names starting with '_internal' and as well as any other of the "
            f"following columns are not allowed in params:\n{invalid_names}."
            f"This is violated for:\n{invalid_present_columns}."
        )
        raise ValueError(msg)

Source File: data_processing.py From AIAlpha with MIT License

4 votes

def make_train_test(self, df_x, df_y, window, csv_path, has_y=False, binary_y=False, save_csv=False):
        """
        Splits the dataset into train and test
        :param df_x: dataframe of x variables
        :type df_x: pd.DataFrame
        :param df_y: dataframe of y values (only read when has_y is True)
        :type df_y: pd.DataFrame
        :param window: the prediction window; when has_y is False the target is
            log(close[t] / close[t + window]), or the raw 'close' column when window is 0
        :type window: int
        :param csv_path: sub-directory of data/processed_data/ the CSVs are written to
            (only used when save_csv is True)
        :type csv_path: str
        :param has_y: True when df_y supplies the target; otherwise the target is
            derived from the 'close' column of df_x
        :type has_y: boolean
        :param binary_y: replace the target by its sign (-1, 0 or 1)
        :type binary_y: boolean
        :param save_csv: also write the six returned objects to CSV files
        :type save_csv: boolean
        :return: fulldata, y_values, train_x, train_y, test_x, test_y
        :rtype: tuple; y_values (and its train/test slices) is a pd.DataFrame when
            has_y is True and a pd.Series otherwise
        """
        if has_y:
            y_values = df_y.copy()
            y_values.columns = ['y_values']
            fulldata = df_x.copy()
        elif window == 0:
            y_values = df_x['close'].copy()
            # NOTE(review): y_values is a Series here, so this presumably only
            # sets a plain attribute; kept for compatibility - confirm intent
            y_values.columns = ['y_values']
            fulldata = df_x.copy()
        else:
            close = df_x['close']
            y_values = np.log(close / close.shift(-window)).dropna()
            # NOTE(review): same remark as above, y_values is a Series
            y_values.columns = ['y_values']
            fulldata = df_x.iloc[:-window, :].copy()

        if binary_y:
            # np.sign maps <0 to -1, >0 to 1 and 0 to 0 and works for both the
            # DataFrame (has_y) and the Series case; the previous column lookup
            # y_values['y_values'] raised KeyError whenever y_values was a Series
            y_values = np.sign(y_values)

        print(y_values.shape)
        print(fulldata.shape)

        split_at = int(len(y_values) * self.split)

        # NOTE(review): the test slices start at split_at + 1, so the row at
        # split_at lands in neither set - kept as is, confirm whether the gap
        # is intentional
        train_y = y_values.iloc[:split_at]
        test_y = y_values.iloc[split_at + 1:]

        train_x = fulldata.iloc[:split_at, :]
        test_x = fulldata.iloc[split_at + 1:len(y_values), :]

        print(train_y.shape)
        print(train_x.shape)

        if save_csv:
            train_x.to_csv(f'data/processed_data/{csv_path}/train_x.csv')
            train_y.to_csv(f'data/processed_data/{csv_path}/train_y.csv', header=['y_values'])
            test_x.to_csv(f'data/processed_data/{csv_path}/test_x.csv')
            test_y.to_csv(f'data/processed_data/{csv_path}/test_y.csv', header=['y_values'])
            fulldata.to_csv(f'data/processed_data/{csv_path}/full_x.csv')
            y_values.to_csv(f'data/processed_data/{csv_path}/full_y.csv', header=['y_values'])
        return fulldata, y_values, train_x, train_y, test_x, test_y

Source File: pdutils.py From pysystemtrade with GNU General Public License v3.0

4 votes

def find_dates_when_label_changes(original_data, new_data, col_names=dict(data='PRICE',
                                                                                        label='PRICE_CONTRACT')):
    """
    For two pd.DataFrames that each include a label column, find the date from which the
     labelling of the new data is consistent with the original data

    >>> s1=pd.DataFrame(dict(PRICE=[1,2,3,np.nan], PRICE_CONTRACT = ["a", "a", "b", "b"]), index=['a1','a2','a3','a4'])
    >>> s2=pd.DataFrame(dict(PRICE=[  2,3,4], PRICE_CONTRACT = [          "b", "b", "b"]), index=['a2','a3','a4'])
    >>> find_dates_when_label_changes(s1, s2)
    ('a3', 'a2')
    >>> s2=pd.DataFrame(dict(PRICE=[  2,3,4], PRICE_CONTRACT = [          "a", "b", "b"]), index=['a2','a3','a4'])
    >>> find_dates_when_label_changes(s1, s2)
    ('a2', 'a1')
    >>> s2=pd.DataFrame(dict(PRICE=[  2,3,4], PRICE_CONTRACT = [          "c", "c", "c"]), index=['a2','a3','a4'])
    >>> find_dates_when_label_changes(s1, s2)
    mismatch_on_last_day
    >>> find_dates_when_label_changes(s1, s1)
    original index matches new
    >>> s2=pd.DataFrame(dict(PRICE=[1, 2,3,4], PRICE_CONTRACT = ["a","c", "c", "c"]), index=['a1','a2','a3','a4'])
    >>> find_dates_when_label_changes(s1, s2)
    mismatch_on_last_day

    :param original_data: existing data; must contain the label column
    :param new_data: new data; must contain the label column
    :param col_names: dict of str; only the 'label' entry is read here
    :return: tuple (first date after the labels stop mismatching, last date of mismatch),
             or one of the objects mismatch_on_last_day / original_index_matches_new
    """
    label_key = col_names['label']

    # Put both label columns side by side on a common, sorted index
    side_by_side = pd.concat([original_data[label_key],
                              new_data[label_key]], axis=1)
    side_by_side.columns = ['current', 'new']
    side_by_side = side_by_side.sort_index()

    new_data_start = new_data.index[0]

    # Only the period covered by the new data matters; gaps are filled forwards
    current_labels = side_by_side['current'][new_data_start:].ffill()
    new_labels = side_by_side['new'][new_data_start:].ffill()

    match_data = find_dates_when_series_starts_matching(current_labels, new_labels)

    if match_data is mismatch_on_last_day:
        # None of the new data can be used
        return mismatch_on_last_day

    if match_data is not all_labels_match:
        # The helper located the switch-over point itself
        first_date_after_series_mismatch, last_date_when_series_mismatch = match_data
        return first_date_after_series_mismatch, last_date_when_series_mismatch

    # From here on every label in the new period matches
    if new_data.index[0] == original_data.index[0]:
        # Both start on the same date, so the whole of the original data is kept
        return original_index_matches_new

    # All of the new data is usable: the "mismatch" is taken to be the last
    # original date strictly before the new data begins
    dates_before_new_data = original_data.index[original_data.index < new_data_start]
    return new_data_start, dates_before_new_data[-1]

Source File: transform_problem.py From estimagic with BSD 3-Clause "New" or "Revised" License

4 votes

def _pre_process_arguments(
    params, algorithm, algo_options, logging, dashboard, dash_options
):
    """Process user supplied arguments without affecting the optimization problem.

    Args:
        params (pd.DataFrame or list of pd.DataFrames): See :ref:`params`.
        algorithm (str or list of strings): Identifier of the optimization algorithm.
            See :ref:`list_of_algorithms` for supported values.
        algo_options (dict or list of dicts):
            algorithm specific configurations for the optimization
        logging: Returned as ``database_path`` when ``dashboard`` is truthy.
            Presumably the path of the logging database - confirm against the
            caller.
        dashboard (bool): Whether to create and show a dashboard, default is False.
            See :ref:`dashboard` for details.
        dash_options (dict or list of dict, optional): Options passed to the dashboard.
            ``None`` is treated like an empty dict. Supported keys are:
                - port (int): port where to display the dashboard
                - no_browser (bool): whether to display the dashboard in a browser
                - rollover (int): how many iterations to keep in the monitoring plots

    Returns:
        optim_kwargs (dict): dictionary collecting the arguments that are going to be
            passed to _internal_minimize
        params (pd.DataFrame): The expanded params DataFrame with all needed columns.
            See :ref:`params`.
        dash_options (dict): the supplied options merged over the defaults
            ``no_browser=False``, ``port=None``, ``rollover=500``.
        database_path (str or pathlib.Path or None): ``logging`` if ``dashboard``
            is truthy, else None.

    """
    standard_dash_options = {"no_browser": False, "port": None, "rollover": 500}
    # important for dash_options to be last for standards to be overwritten;
    # ``or {}`` lets callers omit the options (None cannot be unpacked with **)
    dash_options = {**standard_dash_options, **(dash_options or {})}

    origin, algo_name = _process_algorithm(algorithm)
    optim_kwargs = {
        "origin": origin,
        "algo_name": algo_name,
        "algo_options": algo_options,
    }

    params = _set_params_defaults_if_missing(params)
    _check_params(params)

    database_path = logging if dashboard else None

    return optim_kwargs, params, dash_options, database_path

Python pandas.DataFrames() Examples