Python pandas.get_dummies() Examples
The following are 30 code examples of pandas.get_dummies(), collected from open-source projects. The original project and source file are noted above each example.
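Before the project-level examples, here is a minimal sketch (using made-up data) of the basic call: get_dummies() expands each object or categorical column of a Series or DataFrame into one indicator column per distinct value, while numeric columns pass through unchanged.

import pandas as pd

# Toy frame with one numeric and one string column (hypothetical data).
df = pd.DataFrame({'C': [1, 2, 3], 'A': ['a', 'b', 'a']})

# 'A' is expanded into indicator columns 'A_a' and 'A_b'; 'C' is kept as-is.
# Depending on the pandas version, the indicator dtype is uint8 (0/1) or bool.
print(pd.get_dummies(df))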
Example #1
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_int_df(self, dtype):
    data = DataFrame(
        {'A': [1, 2, 1],
         'B': pd.Categorical(['a', 'b', 'a']),
         'C': [1, 2, 1],
         'D': [1., 2., 1.]}
    )
    columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
    expected = DataFrame([
        [1, 1., 1, 0, 1, 0],
        [2, 2., 0, 1, 0, 1],
        [1, 1., 1, 0, 1, 0]
    ], columns=columns)
    expected[columns[2:]] = expected[columns[2:]].astype(dtype)
    result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
    tm.assert_frame_equal(result, expected)
Example #2
Source File: BPNN_Classify_Data.py From Machine-Learning-for-Beginner-by-Python3 with MIT License | 6 votes |
def handle_data(filepath, miss='fill'):  # function that preprocesses the data
    data = pd.read_csv(r'%s' % filepath)
    data = data.replace('?', np.nan)
    # handle missing values
    if miss == 'del':  # drop rows containing missing values
        miss_data = data.dropna(how='any')
    else:
        miss_data = data.fillna(method='ffill')
    # create a new DataFrame
    newdata = pd.DataFrame()
    # one-hot encoding
    for ikey in miss_data:
        if miss_data[ikey].dtype == 'object':
            # one-hot encode the column
            onedata = pd.get_dummies(miss_data[ikey])
            newdata = pd.concat([newdata, onedata], axis=1)
        else:
            newdata[ikey] = miss_data[ikey]
    return newdata
Example #3
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
    df['cat'] = pd.Categorical(['x', 'y', 'y'])
    result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
    if sparse:
        arr = SparseArray
        typ = SparseDtype(dtype, 0)
    else:
        arr = np.array
        typ = dtype

    expected = DataFrame({'C': [1, 2, 3],
                          'A_a': arr([1, 0, 1], dtype=typ),
                          'A_b': arr([0, 1, 0], dtype=typ),
                          'B_b': arr([1, 1, 0], dtype=typ),
                          'B_c': arr([0, 0, 1], dtype=typ),
                          'cat_x': arr([1, 0, 0], dtype=typ),
                          'cat_y': arr([0, 1, 1], dtype=typ)
                          }).sort_index(axis=1)

    assert_frame_equal(result, expected)
Example #4
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_dataframe_dummies_prefix_str(self, df, sparse):
    # not that you should do this...
    result = get_dummies(df, prefix='bad', sparse=sparse)
    bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c']
    expected = DataFrame([[1, 1, 0, 1, 0],
                          [2, 0, 1, 1, 0],
                          [3, 1, 0, 0, 1]],
                         columns=['C'] + bad_columns,
                         dtype=np.uint8)
    expected = expected.astype({"C": np.int64})
    if sparse:
        # work around astyping & assigning with duplicate columns
        # https://github.com/pandas-dev/pandas/issues/14427
        expected = pd.concat([
            pd.Series([1, 2, 3], name='C'),
            pd.Series([1, 0, 1], name='bad_a', dtype='Sparse[uint8]'),
            pd.Series([0, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
            pd.Series([1, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
            pd.Series([0, 0, 1], name='bad_c', dtype='Sparse[uint8]'),
        ], axis=1)
    assert_frame_equal(result, expected)
Example #5
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_dataframe_dummies_prefix_list(self, df, sparse):
    prefixes = ['from_A', 'from_B']
    result = get_dummies(df, prefix=prefixes, sparse=sparse)
    expected = DataFrame({'C': [1, 2, 3],
                          'from_A_a': [1, 0, 1],
                          'from_A_b': [0, 1, 0],
                          'from_B_b': [1, 1, 0],
                          'from_B_c': [0, 0, 1]},
                         dtype=np.uint8)
    expected[['C']] = df[['C']]
    cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
    expected = expected[['C'] + cols]

    typ = pd.SparseArray if sparse else pd.Series
    expected[cols] = expected[cols].apply(lambda x: typ(x))
    assert_frame_equal(result, expected)
Example #6
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_just_na(self, sparse):
    just_na_list = [np.nan]
    just_na_series = Series(just_na_list)
    just_na_series_index = Series(just_na_list, index=['A'])

    res_list = get_dummies(just_na_list, sparse=sparse)
    res_series = get_dummies(just_na_series, sparse=sparse)
    res_series_index = get_dummies(just_na_series_index, sparse=sparse)

    assert res_list.empty
    assert res_series.empty
    assert res_series_index.empty

    assert res_list.index.tolist() == [0]
    assert res_series.index.tolist() == [0]
    assert res_series_index.index.tolist() == ['A']
Example #7
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_dataframe_dummies_all_obj(self, df, sparse):
    df = df[['A', 'B']]
    result = get_dummies(df, sparse=sparse)
    expected = DataFrame({'A_a': [1, 0, 1],
                          'A_b': [0, 1, 0],
                          'B_b': [1, 1, 0],
                          'B_c': [0, 0, 1]},
                         dtype=np.uint8)
    if sparse:
        expected = pd.DataFrame({
            "A_a": pd.SparseArray([1, 0, 1], dtype='uint8'),
            "A_b": pd.SparseArray([0, 1, 0], dtype='uint8'),
            "B_b": pd.SparseArray([1, 1, 0], dtype='uint8'),
            "B_c": pd.SparseArray([0, 0, 1], dtype='uint8'),
        })

    assert_frame_equal(result, expected)
Example #8
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_dataframe_dummies_prefix_sep(self, df, sparse):
    result = get_dummies(df, prefix_sep='..', sparse=sparse)
    expected = DataFrame({'C': [1, 2, 3],
                          'A..a': [1, 0, 1],
                          'A..b': [0, 1, 0],
                          'B..b': [1, 1, 0],
                          'B..c': [0, 0, 1]},
                         dtype=np.uint8)
    expected[['C']] = df[['C']]
    expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
    assert_frame_equal(result, expected)

    result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse)
    expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
    assert_frame_equal(result, expected)

    result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, sparse=sparse)
    assert_frame_equal(result, expected)
Example #9
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_just_na(self, sparse):
    just_na_list = [np.nan]
    just_na_series = Series(just_na_list)
    just_na_series_index = Series(just_na_list, index=['A'])

    res_list = get_dummies(just_na_list, sparse=sparse)
    res_series = get_dummies(just_na_series, sparse=sparse)
    res_series_index = get_dummies(just_na_series_index, sparse=sparse)

    assert res_list.empty
    assert res_series.empty
    assert res_series_index.empty

    assert res_list.index.tolist() == [0]
    assert res_series.index.tolist() == [0]
    assert res_series_index.index.tolist() == ['A']
Example #10
Source File: titanic.py From MachineLearning with Apache License 2.0 | 6 votes |
def process_ticket(combined):
    # a function that extracts each prefix of the ticket,
    # returns 'XXX' if no prefix (i.e. the ticket is a digit)
    def cleanTicket(ticket):
        ticket = ticket.replace('.', '')
        ticket = ticket.replace('/', '')
        ticket = ticket.split()
        # list() is needed under Python 3, where map/filter return iterators
        ticket = list(map(lambda t: t.strip(), ticket))
        ticket = list(filter(lambda t: not t.isdigit(), ticket))
        if len(ticket) > 0:
            return ticket[0]
        else:
            return 'XXX'

    # Extracting dummy variables from tickets:
    combined['Ticket'] = combined['Ticket'].map(cleanTicket)
    tickets_dummies = pd.get_dummies(combined['Ticket'], prefix='Ticket')
    combined = pd.concat([combined, tickets_dummies], axis=1)
    combined.drop('Ticket', inplace=True, axis=1)
    return combined
Example #11
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_basic(self, sparse, dtype):
    s_list = list('abc')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame({'a': [1, 0, 0],
                          'b': [0, 1, 0],
                          'c': [0, 0, 1]},
                         dtype=self.effective_dtype(dtype))
    if sparse:
        expected = expected.apply(pd.SparseArray, fill_value=0.0)
    result = get_dummies(s_list, sparse=sparse, dtype=dtype)
    assert_frame_equal(result, expected)

    result = get_dummies(s_series, sparse=sparse, dtype=dtype)
    assert_frame_equal(result, expected)

    expected.index = list('ABC')
    result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
    assert_frame_equal(result, expected)
Example #12
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_basic_drop_first_one_level(self, sparse):
    # Test the case that categorical variable only has one level.
    s_list = list('aaa')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame(index=np.arange(3))

    result = get_dummies(s_list, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    result = get_dummies(s_series, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    expected = DataFrame(index=list('ABC'))
    result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)
Example #13
Source File: SVM_Classify_Data.py From Machine-Learning-for-Beginner-by-Python3 with MIT License | 6 votes |
def trans(exdata, nor=normal, oh=one_hot, bin=binary):
    keylist = exdata.keys()
    newexdata = pd.DataFrame()
    for ikey in range(len(keylist)):
        if ikey + 1 in nor:
            # standardise numeric columns
            newexdata[keylist[ikey]] = (exdata[keylist[ikey]] - exdata[keylist[ikey]].mean()) / exdata[keylist[ikey]].std()
        elif ikey + 1 in bin:
            # map binary columns to {1, -1}
            newexdata[keylist[ikey]] = [1 if inum == 1 else -1 for inum in exdata[keylist[ikey]]]
        elif ikey + 1 in oh:
            # one-hot encode categorical columns
            newdata = pd.get_dummies(exdata[keylist[ikey]], prefix=keylist[ikey])
            newexdata = pd.concat([newexdata, newdata], axis=1)
    return newexdata

# Class labels:
#   Absence (1)  -> class 1
#   presence (2) -> class -1
# Split the training data into n equal folds and use K-fold cross-validation
# to compute the model's final accuracy.
# Split the training data into training data and validation data.
Example #14
Source File: discrete_model.py From vnpy_crypto with MIT License | 6 votes |
def _pandas_to_dummies(endog):
    if endog.ndim == 2:
        if endog.shape[1] == 1:
            yname = endog.columns[0]
            endog_dummies = get_dummies(endog.iloc[:, 0])
        else:  # series
            yname = 'y'
            endog_dummies = endog
    else:
        yname = endog.name
        endog_dummies = get_dummies(endog)
    ynames = endog_dummies.columns.tolist()

    return endog_dummies, ynames, yname

#### Private Model Classes ####
Example #15
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_int_int(self):
    data = Series([1, 2, 1])
    result = pd.get_dummies(data)
    expected = DataFrame([[1, 0], [0, 1], [1, 0]],
                         columns=[1, 2],
                         dtype=np.uint8)
    tm.assert_frame_equal(result, expected)

    data = Series(pd.Categorical(['a', 'b', 'a']))
    result = pd.get_dummies(data)
    expected = DataFrame([[1, 0], [0, 1], [1, 0]],
                         columns=pd.Categorical(['a', 'b']),
                         dtype=np.uint8)
    tm.assert_frame_equal(result, expected)
Example #16
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_basic_drop_first_NA(self, sparse):
    # Test NA handling together with drop_first
    s_NA = ['a', 'b', np.nan]
    res = get_dummies(s_NA, drop_first=True, sparse=sparse)
    exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
    if sparse:
        exp = exp.apply(pd.SparseArray, fill_value=0)

    assert_frame_equal(res, exp)

    res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
    exp_na = DataFrame(
        {'b': [0, 1, 0],
         nan: [0, 0, 1]},
        dtype=np.uint8).reindex(['b', nan], axis=1)
    if sparse:
        exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
    assert_frame_equal(res_na, exp_na)

    res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
                              sparse=sparse)
    exp_just_na = DataFrame(index=np.arange(1))
    assert_frame_equal(res_just_na, exp_just_na)
Example #17
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
    # GH13854
    for ordered in [False, True]:
        cat = pd.Categorical(list("xy"), categories=list("xyz"),
                             ordered=ordered)
        result = get_dummies(cat, dtype=dtype)

        data = np.array([[1, 0, 0], [0, 1, 0]],
                        dtype=self.effective_dtype(dtype))
        cols = pd.CategoricalIndex(cat.categories,
                                   categories=cat.categories,
                                   ordered=ordered)
        expected = DataFrame(data, columns=cols,
                             dtype=self.effective_dtype(dtype))

        tm.assert_frame_equal(result, expected)
Example #18
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_dataframe_dummies_prefix_dict(self, sparse):
    prefixes = {'A': 'from_A', 'B': 'from_B'}
    df = DataFrame({'C': [1, 2, 3],
                    'A': ['a', 'b', 'a'],
                    'B': ['b', 'b', 'c']})
    result = get_dummies(df, prefix=prefixes, sparse=sparse)

    expected = DataFrame({'C': [1, 2, 3],
                          'from_A_a': [1, 0, 1],
                          'from_A_b': [0, 1, 0],
                          'from_B_b': [1, 1, 0],
                          'from_B_c': [0, 0, 1]})

    columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
    expected[columns] = expected[columns].astype(np.uint8)
    assert_frame_equal(result, expected)
Example #19
Source File: cbc_hb.py From lifestyles with MIT License | 6 votes |
def model(profiles, comparisons, selections, sample=2500, alpha_prior_std=10):
    all_attributes = pd.get_dummies(profiles).columns
    profiles_dummies = pd.get_dummies(profiles, drop_first=True)
    choices = pd.concat({profile: profiles_dummies.loc[comparisons[profile]].reset_index(drop=True)
                         for profile in comparisons.columns}, axis=1)

    respondants = selections.columns
    n_attributes_in_model = profiles_dummies.shape[1]
    n_participants = selections.shape[1]

    with pm.Model():
        # https://www.sawtoothsoftware.com/download/ssiweb/CBCHB_Manual.pdf
        # need to include the covariance matrix as a parent of `partsworth`
        alpha = pm.Normal('alpha', 0, sd=alpha_prior_std,
                          shape=n_attributes_in_model,
                          testval=np.random.randn(n_attributes_in_model))
        partsworth = pm.MvNormal("partsworth", alpha,
                                 tau=np.eye(n_attributes_in_model),
                                 shape=(n_participants, n_attributes_in_model))
        cs = [_create_observation_variable(selection, choices, partsworth[i, :])
              for i, (_, selection) in enumerate(selections.iteritems())]

        trace = pm.sample(sample)
    return transform_trace_to_individual_summary_statistics(trace, respondants,
                                                            profiles_dummies.columns,
                                                            all_attributes)
Example #20
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
    # GH13854
    for ordered in [False, True]:
        cat = pd.Categorical(list("xy"), categories=list("xyz"),
                             ordered=ordered)
        result = get_dummies(cat, dtype=dtype)

        data = np.array([[1, 0, 0], [0, 1, 0]],
                        dtype=self.effective_dtype(dtype))
        cols = pd.CategoricalIndex(cat.categories,
                                   categories=cat.categories,
                                   ordered=ordered)
        expected = DataFrame(data, columns=cols,
                             dtype=self.effective_dtype(dtype))

        tm.assert_frame_equal(result, expected)
Example #21
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_int_df(self, dtype):
    data = DataFrame(
        {'A': [1, 2, 1],
         'B': pd.Categorical(['a', 'b', 'a']),
         'C': [1, 2, 1],
         'D': [1., 2., 1.]}
    )
    columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
    expected = DataFrame([
        [1, 1., 1, 0, 1, 0],
        [2, 2., 0, 1, 0, 1],
        [1, 1., 1, 0, 1, 0]
    ], columns=columns)
    expected[columns[2:]] = expected[columns[2:]].astype(dtype)
    result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
    tm.assert_frame_equal(result, expected)
Example #22
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_int_int(self):
    data = Series([1, 2, 1])
    result = pd.get_dummies(data)
    expected = DataFrame([[1, 0], [0, 1], [1, 0]],
                         columns=[1, 2],
                         dtype=np.uint8)
    tm.assert_frame_equal(result, expected)

    data = Series(pd.Categorical(['a', 'b', 'a']))
    result = pd.get_dummies(data)
    expected = DataFrame([[1, 0], [0, 1], [1, 0]],
                         columns=pd.Categorical(['a', 'b']),
                         dtype=np.uint8)
    tm.assert_frame_equal(result, expected)
Example #23
Source File: app.py From demo-self-driving with Apache License 2.0 | 6 votes |
def run_the_app():
    # To make Streamlit fast, st.cache allows us to reuse computation across runs.
    # In this common pattern, we download data from an endpoint only once.
    @st.cache
    def load_metadata(url):
        return pd.read_csv(url)

    # This function uses some Pandas magic to summarize the metadata Dataframe.
    @st.cache
    def create_summary(metadata):
        one_hot_encoded = pd.get_dummies(metadata[["frame", "label"]], columns=["label"])
        summary = one_hot_encoded.groupby(["frame"]).sum().rename(columns={
            "label_biker": "biker",
            "label_car": "car",
            "label_pedestrian": "pedestrian",
            "label_trafficLight": "traffic light",
            "label_truck": "truck"
        })
        return summary

    # An amazing property of st.cached functions is that you can pipe them into
    # one another to form a computation DAG (directed acyclic graph). Streamlit
    # recomputes only whatever subset is required to get the right answer!
    metadata = load_metadata(os.path.join(DATA_URL_ROOT, "labels.csv.gz"))
    summary = create_summary(metadata)

    # Uncomment these lines to peek at these DataFrames.
    # st.write('## Metadata', metadata[:1000], '## Summary', summary[:1000])

    # Draw the UI elements to search for objects (pedestrians, cars, etc.)
    selected_frame_index, selected_frame = frame_selector_ui(summary)
    if selected_frame_index == None:
        st.error("No frames fit the criteria. Please select different label or number.")
        return

    # Draw the UI element to select parameters for the YOLO object detector.
    confidence_threshold, overlap_threshold = object_detector_ui()

    # Load the image from S3.
    image_url = os.path.join(DATA_URL_ROOT, selected_frame)
    image = load_image(image_url)

    # Add boxes for objects on the image. These are the boxes for the ground image.
    boxes = metadata[metadata.frame == selected_frame].drop(columns=["frame"])
    draw_image_with_boxes(image, boxes, "Ground Truth",
                          "**Human-annotated data** (frame `%i`)" % selected_frame_index)

    # Get the boxes for the objects detected by YOLO by running the YOLO model.
    yolo_boxes = yolo_v3(image, confidence_threshold, overlap_threshold)
    draw_image_with_boxes(image, yolo_boxes, "Real-time Computer Vision",
                          "**YOLO v3 Model** (overlap `%3.1f`) (confidence `%3.1f`)" % (overlap_threshold, confidence_threshold))

# This sidebar UI is a little search engine to find certain object types.
Example #24
Source File: EDA.py From G-Bert with MIT License | 6 votes |
def process_side():
    print('process_side')
    side_pd = pd.read_csv(patient_info_file)

    # just use demographic information to avoid future information leak
    # such as lab test and lab measurements
    side_pd = side_pd[['subject_id', 'hadm_id', 'icustay_id', 'gender_male',
                       'admission_type', 'first_icu_stay', 'admission_age',
                       'ethnicity', 'weight', 'height']]

    # process side_information
    side_pd = side_pd.dropna(thresh=4)
    side_pd.fillna(side_pd.mean(), inplace=True)
    side_pd = side_pd.groupby(by=['subject_id', 'hadm_id']).head(
        [1]).reset_index(drop=True)
    side_pd = pd.concat(
        [side_pd, pd.get_dummies(side_pd['ethnicity'])], axis=1)
    side_pd.drop(columns=['ethnicity', 'icustay_id'], inplace=True)
    side_pd.rename(columns={'subject_id': 'SUBJECT_ID',
                            'hadm_id': 'HADM_ID'}, inplace=True)

    return side_pd.reset_index(drop=True)
Example #25
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_dataframe_dummies_with_na(self, df, sparse, dtype):
    df.loc[3, :] = [np.nan, np.nan, np.nan]
    result = get_dummies(df, dummy_na=True,
                         sparse=sparse, dtype=dtype).sort_index(axis=1)

    expected = DataFrame({'C': [1, 2, 3, np.nan],
                          'A_a': [1, 0, 1, 0],
                          'A_b': [0, 1, 0, 0],
                          'A_nan': [0, 0, 0, 1],
                          'B_b': [1, 1, 0, 0],
                          'B_c': [0, 0, 1, 0],
                          'B_nan': [0, 0, 0, 1]}).sort_index(axis=1)
    e_dtype = self.effective_dtype(dtype)
    columns = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']
    expected[columns] = expected[columns].astype(e_dtype)
    assert_frame_equal(result, expected)

    result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
    expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
    assert_frame_equal(result, expected)
Example #26
Source File: multi_input.py From aboleth with Apache License 2.0 | 6 votes |
def input_fn(df):
    """Format the downloaded data."""
    # Creates a dictionary mapping from each continuous feature column name (k)
    # to the values of that column stored in a constant Tensor.
    continuous_cols = [df[k].values for k in CONTINUOUS_COLUMNS]
    X_con = np.stack(continuous_cols).astype(np.float32).T

    # Standardise
    X_con -= X_con.mean(axis=0)
    X_con /= X_con.std(axis=0)

    # Creates a dictionary mapping from each categorical feature column name
    categ_cols = [np.where(pd.get_dummies(df[k]).values)[1][:, np.newaxis]
                  for k in CATEGORICAL_COLUMNS]
    n_values = [np.amax(c) + 1 for c in categ_cols]
    X_cat = np.concatenate(categ_cols, axis=1).astype(np.int32)

    # Converts the label column into a constant Tensor.
    label = df[LABEL_COLUMN].values[:, np.newaxis]

    # Returns the feature columns and the label.
    return X_con, X_cat, n_values, label
Example #27
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_basic_drop_first(self, sparse):
    # GH12402 Add a new parameter `drop_first` to avoid collinearity
    # Basic case
    s_list = list('abc')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame({'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype=np.uint8)

    result = get_dummies(s_list, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    result = get_dummies(s_series, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    expected.index = list('ABC')
    result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)
Example #28
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_basic_drop_first_NA(self, sparse):
    # Test NA handling together with drop_first
    s_NA = ['a', 'b', np.nan]
    res = get_dummies(s_NA, drop_first=True, sparse=sparse)
    exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
    assert_frame_equal(res, exp)

    res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
    exp_na = DataFrame(
        {'b': [0, 1, 0],
         nan: [0, 0, 1]},
        dtype=np.uint8).reindex(['b', nan], axis=1)
    assert_frame_equal(res_na, exp_na)

    res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
                              sparse=sparse)
    exp_just_na = DataFrame(index=np.arange(1))
    assert_frame_equal(res_just_na, exp_just_na)
Example #29
Source File: model.py From cloudml-samples with Apache License 2.0 | 6 votes |
def generator_input(filenames, chunk_size, batch_size=64):
    """Produce features and labels needed by keras fit_generator."""
    feature_cols = None
    while True:
        input_reader = pd.read_csv(
            tf.gfile.Open(filenames[0]),
            names=CSV_COLUMNS,
            chunksize=chunk_size,
            na_values=' ?')

        for input_data in input_reader:
            input_data = input_data.dropna()
            label = pd.get_dummies(input_data.pop(LABEL_COLUMN))

            input_data = to_numeric_features(input_data, feature_cols)

            # Retains schema for next chunk processing.
            if feature_cols is None:
                feature_cols = input_data.columns

            idx_len = input_data.shape[0]
            for index in range(0, idx_len, batch_size):
                yield (input_data.iloc[index:min(idx_len, index + batch_size)],
                       label.iloc[index:min(idx_len, index + batch_size)])
Example #30
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_basic_drop_first_one_level(self, sparse):
    # Test the case that categorical variable only has one level.
    s_list = list('aaa')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame(index=np.arange(3))

    result = get_dummies(s_list, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    result = get_dummies(s_series, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    expected = DataFrame(index=list('ABC'))
    result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)