Python Examples of pandas.core.frame.DataFrame.from_dict

Source File: test_stata.py From recruit with Apache License 2.0

5 votes

def test_categorical_order(self, file):
        """Compare the frame returned by ``read_stata`` with a hand-built one.

        ``file`` is the name of the attribute on ``self`` whose value is
        handed to ``read_stata``.
        """
        # One entry per column:
        # (is_cat, col_name, labels (in order), underlying data)
        spec = [
            (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
            (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
            (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
             np.array([2, 1, 4, 0, 3])),
            (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
            (True, 'float_missing', ['a', 'd', 'e'],
             np.array([0, 1, 2, -1, -1])),
            (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
            (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5)),
        ]

        def build_column(is_cat, labels, codes):
            # Categorical columns are built straight from their codes;
            # the remaining column is a plain float32 series of its labels.
            if is_cat:
                return pd.Categorical.from_codes(codes, labels)
            return pd.Series(labels, dtype=np.float32)

        expected = DataFrame.from_dict(OrderedDict(
            (name, build_column(is_cat, labels, codes))
            for is_cat, name, labels, codes in spec))

        # Read the file and make sure the frame matches the expected one
        parsed = read_stata(getattr(self, file))
        tm.assert_frame_equal(expected, parsed, check_categorical=False)

        # Codes and categories must be identical for categorical columns
        for col in expected:
            if not is_categorical_dtype(expected[col]):
                continue
            want = expected[col].cat
            got = parsed[col].cat
            tm.assert_series_equal(want.codes, got.codes)
            tm.assert_index_equal(want.categories, got.categories)

Source File: stata.py From recruit with Apache License 2.0

5 votes

def _do_convert_categoricals(self, data, value_label_dict, lbllist,
                                 order_categoricals):
        """
        Converts categorical columns to Categorical type.
        """
        value_labels = list(compat.iterkeys(value_label_dict))
        cat_converted_data = []
        for col, label in zip(data, lbllist):
            if label in value_labels:
                # Explicit call with ordered=True
                cat_data = Categorical(data[col], ordered=order_categoricals)
                categories = []
                for category in cat_data.categories:
                    if category in value_label_dict[label]:
                        categories.append(value_label_dict[label][category])
                    else:
                        categories.append(category)  # Partially labeled
                try:
                    cat_data.categories = categories
                except ValueError:
                    vc = Series(categories).value_counts()
                    repeats = list(vc.index[vc > 1])
                    repeats = '\n' + '-' * 80 + '\n'.join(repeats)
                    raise ValueError('Value labels for column {col} are not '
                                     'unique. The repeated labels are:\n'
                                     '{repeats}'
                                     .format(col=col, repeats=repeats))
                # TODO: is the next line needed above in the data(...) method?
                cat_data = Series(cat_data, index=data.index)
                cat_converted_data.append((col, cat_data))
            else:
                cat_converted_data.append((col, data[col]))
        data = DataFrame.from_dict(OrderedDict(cat_converted_data))
        return data

Source File: stata.py From recruit with Apache License 2.0

5 votes

def _prepare_categoricals(self, data):
        """Check for categorical columns, retain categorical information for
        Stata file and convert categorical data to int

        Stores one boolean per column of ``data`` in ``self._is_col_cat`` and
        one ``StataValueLabel`` per categorical column in
        ``self._value_labels``.

        Returns ``data`` itself when it has no categorical column; otherwise
        a new DataFrame in which every categorical column is replaced by its
        codes, with code ``-1`` swapped for the value returned by
        ``StataMissingValue.get_base_missing_value``.

        Raises ``ValueError`` when a categorical column has int64 codes.
        """

        is_cat = [is_categorical_dtype(data[col]) for col in data]
        self._is_col_cat = is_cat
        self._value_labels = []
        if not any(is_cat):
            return data

        get_base_missing_value = StataMissingValue.get_base_missing_value
        data_formatted = []
        for col, col_is_cat in zip(data, is_cat):
            if not col_is_cat:
                data_formatted.append((col, data[col]))
                continue

            self._value_labels.append(StataValueLabel(data[col]))
            dtype = data[col].cat.codes.dtype
            if dtype == np.int64:
                raise ValueError('It is not possible to export '
                                 'int64-based categorical data to Stata.')
            values = data[col].cat.codes.values.copy()

            # Upcast if needed so that correct missing values can be set.
            # A zero-length column has no maximum (``max`` would raise), and
            # it cannot collide with the missing value, so it is left alone.
            if values.size and values.max() >= get_base_missing_value(dtype):
                if dtype == np.int8:
                    dtype = np.int16
                elif dtype == np.int16:
                    dtype = np.int32
                else:
                    dtype = np.float64
                values = np.array(values, dtype=dtype)

            # Replace missing values with Stata missing value for type
            values[values == -1] = get_base_missing_value(dtype)
            data_formatted.append((col, values))
        return DataFrame.from_dict(OrderedDict(data_formatted))

Source File: test_stata.py From vnpy_crypto with MIT License

5 votes

def test_categorical_order(self, file):
        """Compare the frame returned by ``read_stata`` with a hand-built one.

        ``file`` is the name of the attribute on ``self`` whose value is
        handed to ``read_stata``.
        """
        # One entry per column:
        # (is_cat, col_name, labels (in order), underlying data)
        spec = [
            (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
            (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
            (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
             np.array([2, 1, 4, 0, 3])),
            (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
            (True, 'float_missing', ['a', 'd', 'e'],
             np.array([0, 1, 2, -1, -1])),
            (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
            (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5)),
        ]

        def build_column(is_cat, labels, codes):
            # Categorical columns are built straight from their codes;
            # the remaining column is a plain float32 series of its labels.
            if is_cat:
                return pd.Categorical.from_codes(codes, labels)
            return pd.Series(labels, dtype=np.float32)

        expected = DataFrame.from_dict(OrderedDict(
            (name, build_column(is_cat, labels, codes))
            for is_cat, name, labels, codes in spec))

        # Read the file and make sure the frame matches the expected one
        parsed = read_stata(getattr(self, file))
        tm.assert_frame_equal(expected, parsed, check_categorical=False)

        # Codes and categories must be identical for categorical columns
        for col in expected:
            if not is_categorical_dtype(expected[col]):
                continue
            want = expected[col].cat
            got = parsed[col].cat
            tm.assert_series_equal(want.codes, got.codes)
            tm.assert_index_equal(want.categories, got.categories)

Source File: stata.py From vnpy_crypto with MIT License

5 votes

def _do_convert_categoricals(self, data, value_label_dict, lbllist,
                                 order_categoricals):
        """
        Converts categorical columns to Categorical type.
        """
        value_labels = list(compat.iterkeys(value_label_dict))
        cat_converted_data = []
        for col, label in zip(data, lbllist):
            if label in value_labels:
                # Explicit call with ordered=True
                cat_data = Categorical(data[col], ordered=order_categoricals)
                categories = []
                for category in cat_data.categories:
                    if category in value_label_dict[label]:
                        categories.append(value_label_dict[label][category])
                    else:
                        categories.append(category)  # Partially labeled
                try:
                    cat_data.categories = categories
                except ValueError:
                    vc = Series(categories).value_counts()
                    repeats = list(vc.index[vc > 1])
                    repeats = '\n' + '-' * 80 + '\n'.join(repeats)
                    msg = 'Value labels for column {0} are not unique. The ' \
                          'repeated labels are:\n{1}'.format(col, repeats)
                    raise ValueError(msg)
                # TODO: is the next line needed above in the data(...) method?
                cat_data = Series(cat_data, index=data.index)
                cat_converted_data.append((col, cat_data))
            else:
                cat_converted_data.append((col, data[col]))
        data = DataFrame.from_dict(OrderedDict(cat_converted_data))
        return data

Source File: stata.py From vnpy_crypto with MIT License

5 votes

def _prepare_categoricals(self, data):
        """Check for categorical columns, retain categorical information for
        Stata file and convert categorical data to int

        Stores one boolean per column of ``data`` in ``self._is_col_cat`` and
        one ``StataValueLabel`` per categorical column in
        ``self._value_labels``.

        Returns ``data`` itself when it has no categorical column; otherwise
        a new DataFrame in which every categorical column is replaced by its
        codes, with code ``-1`` swapped for the value returned by
        ``StataMissingValue.get_base_missing_value``.

        Raises ``ValueError`` when a categorical column has int64 codes.
        """

        is_cat = [is_categorical_dtype(data[col]) for col in data]
        self._is_col_cat = is_cat
        self._value_labels = []
        if not any(is_cat):
            return data

        get_base_missing_value = StataMissingValue.get_base_missing_value
        data_formatted = []
        for col, col_is_cat in zip(data, is_cat):
            if not col_is_cat:
                data_formatted.append((col, data[col]))
                continue

            self._value_labels.append(StataValueLabel(data[col]))
            dtype = data[col].cat.codes.dtype
            if dtype == np.int64:
                raise ValueError('It is not possible to export '
                                 'int64-based categorical data to Stata.')
            values = data[col].cat.codes.values.copy()

            # Upcast if needed so that correct missing values can be set.
            # A zero-length column has no maximum (``max`` would raise), and
            # it cannot collide with the missing value, so it is left alone.
            if values.size and values.max() >= get_base_missing_value(dtype):
                if dtype == np.int8:
                    dtype = np.int16
                elif dtype == np.int16:
                    dtype = np.int32
                else:
                    dtype = np.float64
                values = np.array(values, dtype=dtype)

            # Replace missing values with Stata missing value for type
            values[values == -1] = get_base_missing_value(dtype)
            data_formatted.append((col, values))
        return DataFrame.from_dict(OrderedDict(data_formatted))

Source File: categorical.py From Computable with MIT License

5 votes

def describe(self):
        """
        Returns a dataframe with frequency and counts by level.

        The frame is indexed by ``self.levels`` and has two columns:
        ``counts``, the number of entries of ``self.labels`` pointing at each
        level, and ``freqs``, each count divided by the sum of all counts.
        Levels that never occur are reported with a count of zero.
        """
        # Hack?
        import numpy as np
        from pandas.core.frame import DataFrame

        labels = np.asarray(self.labels, dtype=np.intp)
        # Count by position so that every level gets exactly one row, even
        # when it never occurs.  Negative labels cannot point at a level
        # (presumably the missing-value marker -- confirm) and are left out.
        counts = np.bincount(labels[labels >= 0],
                             minlength=len(self.levels))
        freqs = counts / float(counts.sum())
        return DataFrame.from_dict({
            'counts': counts,
            'freqs': freqs,
            'levels': self.levels
        }).set_index('levels')

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_categorical_order(self, file):
        """Compare the frame returned by ``read_stata`` with a hand-built one.

        ``file`` is the name of the attribute on ``self`` whose value is
        handed to ``read_stata``.
        """
        # One entry per column:
        # (is_cat, col_name, labels (in order), underlying data)
        spec = [
            (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
            (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
            (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
             np.array([2, 1, 4, 0, 3])),
            (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
            (True, 'float_missing', ['a', 'd', 'e'],
             np.array([0, 1, 2, -1, -1])),
            (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
            (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5)),
        ]

        def build_column(is_cat, labels, codes):
            # Categorical columns are built straight from their codes;
            # the remaining column is a plain float32 series of its labels.
            if is_cat:
                return pd.Categorical.from_codes(codes, labels)
            return pd.Series(labels, dtype=np.float32)

        expected = DataFrame.from_dict(OrderedDict(
            (name, build_column(is_cat, labels, codes))
            for is_cat, name, labels, codes in spec))

        # Read the file and make sure the frame matches the expected one
        parsed = read_stata(getattr(self, file))
        tm.assert_frame_equal(expected, parsed, check_categorical=False)

        # Codes and categories must be identical for categorical columns
        for col in expected:
            if not is_categorical_dtype(expected[col]):
                continue
            want = expected[col].cat
            got = parsed[col].cat
            tm.assert_series_equal(want.codes, got.codes)
            tm.assert_index_equal(want.categories, got.categories)

Source File: stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def _do_convert_categoricals(self, data, value_label_dict, lbllist,
                                 order_categoricals):
        """
        Converts categorical columns to Categorical type.
        """
        value_labels = list(compat.iterkeys(value_label_dict))
        cat_converted_data = []
        for col, label in zip(data, lbllist):
            if label in value_labels:
                # Explicit call with ordered=True
                cat_data = Categorical(data[col], ordered=order_categoricals)
                categories = []
                for category in cat_data.categories:
                    if category in value_label_dict[label]:
                        categories.append(value_label_dict[label][category])
                    else:
                        categories.append(category)  # Partially labeled
                try:
                    cat_data.categories = categories
                except ValueError:
                    vc = Series(categories).value_counts()
                    repeats = list(vc.index[vc > 1])
                    repeats = '\n' + '-' * 80 + '\n'.join(repeats)
                    raise ValueError('Value labels for column {col} are not '
                                     'unique. The repeated labels are:\n'
                                     '{repeats}'
                                     .format(col=col, repeats=repeats))
                # TODO: is the next line needed above in the data(...) method?
                cat_data = Series(cat_data, index=data.index)
                cat_converted_data.append((col, cat_data))
            else:
                cat_converted_data.append((col, data[col]))
        data = DataFrame.from_dict(OrderedDict(cat_converted_data))
        return data

Source File: stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def _prepare_categoricals(self, data):
        """Check for categorical columns, retain categorical information for
        Stata file and convert categorical data to int

        Stores one boolean per column of ``data`` in ``self._is_col_cat`` and
        one ``StataValueLabel`` per categorical column in
        ``self._value_labels``.

        Returns ``data`` itself when it has no categorical column; otherwise
        a new DataFrame in which every categorical column is replaced by its
        codes, with code ``-1`` swapped for the value returned by
        ``StataMissingValue.get_base_missing_value``.

        Raises ``ValueError`` when a categorical column has int64 codes.
        """

        is_cat = [is_categorical_dtype(data[col]) for col in data]
        self._is_col_cat = is_cat
        self._value_labels = []
        if not any(is_cat):
            return data

        get_base_missing_value = StataMissingValue.get_base_missing_value
        data_formatted = []
        for col, col_is_cat in zip(data, is_cat):
            if not col_is_cat:
                data_formatted.append((col, data[col]))
                continue

            self._value_labels.append(StataValueLabel(data[col]))
            dtype = data[col].cat.codes.dtype
            if dtype == np.int64:
                raise ValueError('It is not possible to export '
                                 'int64-based categorical data to Stata.')
            values = data[col].cat.codes.values.copy()

            # Upcast if needed so that correct missing values can be set.
            # A zero-length column has no maximum (``max`` would raise), and
            # it cannot collide with the missing value, so it is left alone.
            if values.size and values.max() >= get_base_missing_value(dtype):
                if dtype == np.int8:
                    dtype = np.int16
                elif dtype == np.int16:
                    dtype = np.int32
                else:
                    dtype = np.float64
                values = np.array(values, dtype=dtype)

            # Replace missing values with Stata missing value for type
            values[values == -1] = get_base_missing_value(dtype)
            data_formatted.append((col, values))
        return DataFrame.from_dict(OrderedDict(data_formatted))

Source File: pandas2ri.py From rpy2 with GNU General Public License v2.0

5 votes

def rpy2py_dataframe(obj):
    """Build a ``PandasDataFrame`` from the items of ``obj``.

    Every value that is a ``Sexp`` is first passed through ``rpy2py``; any
    other value is used unchanged.  Column order follows ``obj.items()`` and
    the index of the result is set to ``obj.rownames``.
    """
    columns = OrderedDict()
    for name, column in obj.items():
        if isinstance(column, Sexp):
            column = rpy2py(column)
        columns[name] = column

    frame = PandasDataFrame.from_dict(columns)
    frame.index = obj.rownames
    return frame

Source File: test_stata.py From twitter-stock-recommendation with MIT License

5 votes

def test_categorical_order(self, file):
        """Compare the frame returned by ``read_stata`` with a hand-built one.

        ``file`` is the name of the attribute on ``self`` whose value is
        handed to ``read_stata``.
        """
        # One entry per column:
        # (is_cat, col_name, labels (in order), underlying data)
        spec = [
            (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
            (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
            (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
             np.array([2, 1, 4, 0, 3])),
            (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
            (True, 'float_missing', ['a', 'd', 'e'],
             np.array([0, 1, 2, -1, -1])),
            (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
            (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5)),
        ]

        def build_column(is_cat, labels, codes):
            # Categorical columns are built straight from their codes;
            # the remaining column is a plain float32 series of its labels.
            if is_cat:
                return pd.Categorical.from_codes(codes, labels)
            return pd.Series(labels, dtype=np.float32)

        expected = DataFrame.from_dict(OrderedDict(
            (name, build_column(is_cat, labels, codes))
            for is_cat, name, labels, codes in spec))

        # Read the file and make sure the frame matches the expected one
        parsed = read_stata(getattr(self, file))
        tm.assert_frame_equal(expected, parsed, check_categorical=False)

        # Codes and categories must be identical for categorical columns
        for col in expected:
            if not is_categorical_dtype(expected[col]):
                continue
            want = expected[col].cat
            got = parsed[col].cat
            tm.assert_series_equal(want.codes, got.codes)
            tm.assert_index_equal(want.categories, got.categories)

Python pandas.core.frame.DataFrame.from_dict() Examples