Python pandas.get_dummies() Examples
The following are 30 code examples of pandas.get_dummies(), collected from open-source projects. The original project and source file are noted above each example.
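Before the project-level examples, here is a minimal sketch (using made-up data) of the basic call: get_dummies() expands each object or categorical column of a Series or DataFrame into one indicator column per distinct value, while numeric columns pass through unchanged.

import pandas as pd

# Toy frame with one numeric and one string column (hypothetical data).
df = pd.DataFrame({'C': [1, 2, 3], 'A': ['a', 'b', 'a']})

# 'A' is expanded into indicator columns 'A_a' and 'A_b'; 'C' is kept as-is.
# Depending on the pandas version, the indicator dtype is uint8 (0/1) or bool.
print(pd.get_dummies(df))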
Example #1
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_int_df(self, dtype):
    data = DataFrame(
        {'A': [1, 2, 1],
         'B': pd.Categorical(['a', 'b', 'a']),
         'C': [1, 2, 1],
         'D': [1., 2., 1.]}
    )
    columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
    expected = DataFrame([
        [1, 1., 1, 0, 1, 0],
        [2, 2., 0, 1, 0, 1],
        [1, 1., 1, 0, 1, 0]
    ], columns=columns)
    expected[columns[2:]] = expected[columns[2:]].astype(dtype)
    result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
    tm.assert_frame_equal(result, expected)
Example #2
Source File: BPNN_Classify_Data.py From Machine-Learning-for-Beginner-by-Python3 with MIT License | 6 votes |
def handle_data(filepath, miss='fill'):  # function that preprocesses the data
    data = pd.read_csv(r'%s' % filepath)
    data = data.replace('?', np.nan)
    # handle missing values
    if miss == 'del':  # drop rows containing missing values
        miss_data = data.dropna(how='any')
    else:
        miss_data = data.fillna(method='ffill')
    # create a new DataFrame
    newdata = pd.DataFrame()
    # one-hot encoding
    for ikey in miss_data:
        if miss_data[ikey].dtype == 'object':
            # one-hot encode the column
            onedata = pd.get_dummies(miss_data[ikey])
            newdata = pd.concat([newdata, onedata], axis=1)
        else:
            newdata[ikey] = miss_data[ikey]
    return newdata
Example #3
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
    df['cat'] = pd.Categorical(['x', 'y', 'y'])
    result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
    if sparse:
        arr = SparseArray
        typ = SparseDtype(dtype, 0)
    else:
        arr = np.array
        typ = dtype

    expected = DataFrame({'C': [1, 2, 3],
                          'A_a': arr([1, 0, 1], dtype=typ),
                          'A_b': arr([0, 1, 0], dtype=typ),
                          'B_b': arr([1, 1, 0], dtype=typ),
                          'B_c': arr([0, 0, 1], dtype=typ),
                          'cat_x': arr([1, 0, 0], dtype=typ),
                          'cat_y': arr([0, 1, 1], dtype=typ)
                          }).sort_index(axis=1)

    assert_frame_equal(result, expected)
Example #4
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_dataframe_dummies_prefix_str(self, df, sparse):
    # not that you should do this...
    result = get_dummies(df, prefix='bad', sparse=sparse)
    bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c']
    expected = DataFrame([[1, 1, 0, 1, 0],
                          [2, 0, 1, 1, 0],
                          [3, 1, 0, 0, 1]],
                         columns=['C'] + bad_columns,
                         dtype=np.uint8)
    expected = expected.astype({"C": np.int64})
    if sparse:
        # work around astyping & assigning with duplicate columns
        # https://github.com/pandas-dev/pandas/issues/14427
        expected = pd.concat([
            pd.Series([1, 2, 3], name='C'),
            pd.Series([1, 0, 1], name='bad_a', dtype='Sparse[uint8]'),
            pd.Series([0, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
            pd.Series([1, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
            pd.Series([0, 0, 1], name='bad_c', dtype='Sparse[uint8]'),
        ], axis=1)
    assert_frame_equal(result, expected)
Example #5
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_dataframe_dummies_prefix_list(self, df, sparse):
    prefixes = ['from_A', 'from_B']
    result = get_dummies(df, prefix=prefixes, sparse=sparse)
    expected = DataFrame({'C': [1, 2, 3],
                          'from_A_a': [1, 0, 1],
                          'from_A_b': [0, 1, 0],
                          'from_B_b': [1, 1, 0],
                          'from_B_c': [0, 0, 1]},
                         dtype=np.uint8)
    expected[['C']] = df[['C']]
    cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
    expected = expected[['C'] + cols]

    typ = pd.SparseArray if sparse else pd.Series
    expected[cols] = expected[cols].apply(lambda x: typ(x))
    assert_frame_equal(result, expected)
Example #6
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_just_na(self, sparse):
    just_na_list = [np.nan]
    just_na_series = Series(just_na_list)
    just_na_series_index = Series(just_na_list, index=['A'])

    res_list = get_dummies(just_na_list, sparse=sparse)
    res_series = get_dummies(just_na_series, sparse=sparse)
    res_series_index = get_dummies(just_na_series_index, sparse=sparse)

    assert res_list.empty
    assert res_series.empty
    assert res_series_index.empty

    assert res_list.index.tolist() == [0]
    assert res_series.index.tolist() == [0]
    assert res_series_index.index.tolist() == ['A']
Example #7
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_dataframe_dummies_all_obj(self, df, sparse):
    df = df[['A', 'B']]
    result = get_dummies(df, sparse=sparse)
    expected = DataFrame({'A_a': [1, 0, 1],
                          'A_b': [0, 1, 0],
                          'B_b': [1, 1, 0],
                          'B_c': [0, 0, 1]},
                         dtype=np.uint8)
    if sparse:
        expected = pd.DataFrame({
            "A_a": pd.SparseArray([1, 0, 1], dtype='uint8'),
            "A_b": pd.SparseArray([0, 1, 0], dtype='uint8'),
            "B_b": pd.SparseArray([1, 1, 0], dtype='uint8'),
            "B_c": pd.SparseArray([0, 0, 1], dtype='uint8'),
        })

    assert_frame_equal(result, expected)
Example #8
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_dataframe_dummies_prefix_sep(self, df, sparse):
    result = get_dummies(df, prefix_sep='..', sparse=sparse)
    expected = DataFrame({'C': [1, 2, 3],
                          'A..a': [1, 0, 1],
                          'A..b': [0, 1, 0],
                          'B..b': [1, 1, 0],
                          'B..c': [0, 0, 1]},
                         dtype=np.uint8)
    expected[['C']] = df[['C']]
    expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
    assert_frame_equal(result, expected)

    result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse)
    expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
    assert_frame_equal(result, expected)

    result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, sparse=sparse)
    assert_frame_equal(result, expected)
Example #9
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_just_na(self, sparse):
    just_na_list = [np.nan]
    just_na_series = Series(just_na_list)
    just_na_series_index = Series(just_na_list, index=['A'])

    res_list = get_dummies(just_na_list, sparse=sparse)
    res_series = get_dummies(just_na_series, sparse=sparse)
    res_series_index = get_dummies(just_na_series_index, sparse=sparse)

    assert res_list.empty
    assert res_series.empty
    assert res_series_index.empty

    assert res_list.index.tolist() == [0]
    assert res_series.index.tolist() == [0]
    assert res_series_index.index.tolist() == ['A']
Example #10
Source File: titanic.py From MachineLearning with Apache License 2.0 | 6 votes |
def process_ticket(combined):
    # a function that extracts each prefix of the ticket,
    # returns 'XXX' if no prefix (i.e. the ticket is a digit)
    def cleanTicket(ticket):
        ticket = ticket.replace('.', '')
        ticket = ticket.replace('/', '')
        ticket = ticket.split()
        # list() is needed under Python 3, where map/filter return iterators
        ticket = list(map(lambda t: t.strip(), ticket))
        ticket = list(filter(lambda t: not t.isdigit(), ticket))
        if len(ticket) > 0:
            return ticket[0]
        else:
            return 'XXX'

    # Extracting dummy variables from tickets:
    combined['Ticket'] = combined['Ticket'].map(cleanTicket)
    tickets_dummies = pd.get_dummies(combined['Ticket'], prefix='Ticket')
    combined = pd.concat([combined, tickets_dummies], axis=1)
    combined.drop('Ticket', inplace=True, axis=1)
    return combined
Example #11
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_basic(self, sparse, dtype):
    s_list = list('abc')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame({'a': [1, 0, 0],
                          'b': [0, 1, 0],
                          'c': [0, 0, 1]},
                         dtype=self.effective_dtype(dtype))
    if sparse:
        expected = expected.apply(pd.SparseArray, fill_value=0.0)
    result = get_dummies(s_list, sparse=sparse, dtype=dtype)
    assert_frame_equal(result, expected)

    result = get_dummies(s_series, sparse=sparse, dtype=dtype)
    assert_frame_equal(result, expected)

    expected.index = list('ABC')
    result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
    assert_frame_equal(result, expected)
Example #12
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_basic_drop_first_one_level(self, sparse):
    # Test the case that categorical variable only has one level.
    s_list = list('aaa')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame(index=np.arange(3))

    result = get_dummies(s_list, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    result = get_dummies(s_series, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    expected = DataFrame(index=list('ABC'))
    result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)
Example #13
Source File: SVM_Classify_Data.py From Machine-Learning-for-Beginner-by-Python3 with MIT License | 6 votes |
def trans(exdata, nor=normal, oh=one_hot, bin=binary):
    keylist = exdata.keys()
    newexdata = pd.DataFrame()
    for ikey in range(len(keylist)):
        if ikey + 1 in nor:
            # standardise numeric columns
            newexdata[keylist[ikey]] = (exdata[keylist[ikey]] - exdata[keylist[ikey]].mean()) / exdata[keylist[ikey]].std()
        elif ikey + 1 in bin:
            # map binary columns to {1, -1}
            newexdata[keylist[ikey]] = [1 if inum == 1 else -1 for inum in exdata[keylist[ikey]]]
        elif ikey + 1 in oh:
            # one-hot encode categorical columns
            newdata = pd.get_dummies(exdata[keylist[ikey]], prefix=keylist[ikey])
            newexdata = pd.concat([newexdata, newdata], axis=1)
    return newexdata

# Class labels:
#   Absence (1)  -> class 1
#   presence (2) -> class -1
# Split the training data into n equal folds and use K-fold cross-validation
# to compute the model's final accuracy.
# Split the training data into training data and validation data.
Example #14
Source File: discrete_model.py From vnpy_crypto with MIT License | 6 votes |
def _pandas_to_dummies(endog):
    if endog.ndim == 2:
        if endog.shape[1] == 1:
            yname = endog.columns[0]
            endog_dummies = get_dummies(endog.iloc[:, 0])
        else:  # series
            yname = 'y'
            endog_dummies = endog
    else:
        yname = endog.name
        endog_dummies = get_dummies(endog)
    ynames = endog_dummies.columns.tolist()

    return endog_dummies, ynames, yname

#### Private Model Classes ####
Example #15
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_int_int(self):
    data = Series([1, 2, 1])
    result = pd.get_dummies(data)
    expected = DataFrame([[1, 0], [0, 1], [1, 0]],
                         columns=[1, 2],
                         dtype=np.uint8)
    tm.assert_frame_equal(result, expected)

    data = Series(pd.Categorical(['a', 'b', 'a']))
    result = pd.get_dummies(data)
    expected = DataFrame([[1, 0], [0, 1], [1, 0]],
                         columns=pd.Categorical(['a', 'b']),
                         dtype=np.uint8)
    tm.assert_frame_equal(result, expected)
Example #16
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_basic_drop_first_NA(self, sparse):
    # Test NA handling together with drop_first
    s_NA = ['a', 'b', np.nan]
    res = get_dummies(s_NA, drop_first=True, sparse=sparse)
    exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
    if sparse:
        exp = exp.apply(pd.SparseArray, fill_value=0)

    assert_frame_equal(res, exp)

    res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
    exp_na = DataFrame(
        {'b': [0, 1, 0],
         nan: [0, 0, 1]},
        dtype=np.uint8).reindex(['b', nan], axis=1)
    if sparse:
        exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
    assert_frame_equal(res_na, exp_na)

    res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
                              sparse=sparse)
    exp_just_na = DataFrame(index=np.arange(1))
    assert_frame_equal(res_just_na, exp_just_na)
Example #17
Source File: test_reshape.py From recruit with Apache License 2.0 | 6 votes |
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
    # GH13854
    for ordered in [False, True]:
        cat = pd.Categorical(list("xy"), categories=list("xyz"),
                             ordered=ordered)
        result = get_dummies(cat, dtype=dtype)

        data = np.array([[1, 0, 0], [0, 1, 0]],
                        dtype=self.effective_dtype(dtype))
        cols = pd.CategoricalIndex(cat.categories,
                                   categories=cat.categories,
                                   ordered=ordered)
        expected = DataFrame(data, columns=cols,
                             dtype=self.effective_dtype(dtype))

        tm.assert_frame_equal(result, expected)
Example #18
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_dataframe_dummies_prefix_dict(self, sparse):
    prefixes = {'A': 'from_A', 'B': 'from_B'}
    df = DataFrame({'C': [1, 2, 3],
                    'A': ['a', 'b', 'a'],
                    'B': ['b', 'b', 'c']})
    result = get_dummies(df, prefix=prefixes, sparse=sparse)

    expected = DataFrame({'C': [1, 2, 3],
                          'from_A_a': [1, 0, 1],
                          'from_A_b': [0, 1, 0],
                          'from_B_b': [1, 1, 0],
                          'from_B_c': [0, 0, 1]})

    columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
    expected[columns] = expected[columns].astype(np.uint8)
    assert_frame_equal(result, expected)
Example #19
Source File: cbc_hb.py From lifestyles with MIT License | 6 votes |
def model(profiles, comparisons, selections, sample=2500, alpha_prior_std=10):
    all_attributes = pd.get_dummies(profiles).columns
    profiles_dummies = pd.get_dummies(profiles, drop_first=True)
    choices = pd.concat({profile: profiles_dummies.loc[comparisons[profile]].reset_index(drop=True)
                         for profile in comparisons.columns}, axis=1)

    respondants = selections.columns
    n_attributes_in_model = profiles_dummies.shape[1]
    n_participants = selections.shape[1]

    with pm.Model():
        # https://www.sawtoothsoftware.com/download/ssiweb/CBCHB_Manual.pdf
        # need to include the covariance matrix as a parent of `partsworth`
        alpha = pm.Normal('alpha', 0, sd=alpha_prior_std,
                          shape=n_attributes_in_model,
                          testval=np.random.randn(n_attributes_in_model))
        partsworth = pm.MvNormal("partsworth", alpha,
                                 tau=np.eye(n_attributes_in_model),
                                 shape=(n_participants, n_attributes_in_model))
        cs = [_create_observation_variable(selection, choices, partsworth[i, :])
              for i, (_, selection) in enumerate(selections.iteritems())]

        trace = pm.sample(sample)
    return transform_trace_to_individual_summary_statistics(trace, respondants,
                                                            profiles_dummies.columns,
                                                            all_attributes)
Example #20
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
    # GH13854
    for ordered in [False, True]:
        cat = pd.Categorical(list("xy"), categories=list("xyz"),
                             ordered=ordered)
        result = get_dummies(cat, dtype=dtype)

        data = np.array([[1, 0, 0], [0, 1, 0]],
                        dtype=self.effective_dtype(dtype))
        cols = pd.CategoricalIndex(cat.categories,
                                   categories=cat.categories,
                                   ordered=ordered)
        expected = DataFrame(data, columns=cols,
                             dtype=self.effective_dtype(dtype))

        tm.assert_frame_equal(result, expected)
Example #21
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_int_df(self, dtype):
    data = DataFrame(
        {'A': [1, 2, 1],
         'B': pd.Categorical(['a', 'b', 'a']),
         'C': [1, 2, 1],
         'D': [1., 2., 1.]}
    )
    columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
    expected = DataFrame([
        [1, 1., 1, 0, 1, 0],
        [2, 2., 0, 1, 0, 1],
        [1, 1., 1, 0, 1, 0]
    ], columns=columns)
    expected[columns[2:]] = expected[columns[2:]].astype(dtype)
    result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
    tm.assert_frame_equal(result, expected)
Example #22
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_int_int(self):
    data = Series([1, 2, 1])
    result = pd.get_dummies(data)
    expected = DataFrame([[1, 0], [0, 1], [1, 0]],
                         columns=[1, 2],
                         dtype=np.uint8)
    tm.assert_frame_equal(result, expected)

    data = Series(pd.Categorical(['a', 'b', 'a']))
    result = pd.get_dummies(data)
    expected = DataFrame([[1, 0], [0, 1], [1, 0]],
                         columns=pd.Categorical(['a', 'b']),
                         dtype=np.uint8)
    tm.assert_frame_equal(result, expected)
Example #23
Source File: app.py From demo-self-driving with Apache License 2.0 | 6 votes |
def run_the_app():
    # To make Streamlit fast, st.cache allows us to reuse computation across runs.
    # In this common pattern, we download data from an endpoint only once.
    @st.cache
    def load_metadata(url):
        return pd.read_csv(url)

    # This function uses some Pandas magic to summarize the metadata Dataframe.
    @st.cache
    def create_summary(metadata):
        one_hot_encoded = pd.get_dummies(metadata[["frame", "label"]], columns=["label"])
        summary = one_hot_encoded.groupby(["frame"]).sum().rename(columns={
            "label_biker": "biker",
            "label_car": "car",
            "label_pedestrian": "pedestrian",
            "label_trafficLight": "traffic light",
            "label_truck": "truck"
        })
        return summary

    # An amazing property of st.cached functions is that you can pipe them into
    # one another to form a computation DAG (directed acyclic graph). Streamlit
    # recomputes only whatever subset is required to get the right answer!
    metadata = load_metadata(os.path.join(DATA_URL_ROOT, "labels.csv.gz"))
    summary = create_summary(metadata)

    # Uncomment these lines to peek at these DataFrames.
    # st.write('## Metadata', metadata[:1000], '## Summary', summary[:1000])

    # Draw the UI elements to search for objects (pedestrians, cars, etc.)
    selected_frame_index, selected_frame = frame_selector_ui(summary)
    if selected_frame_index == None:
        st.error("No frames fit the criteria. Please select different label or number.")
        return

    # Draw the UI element to select parameters for the YOLO object detector.
    confidence_threshold, overlap_threshold = object_detector_ui()

    # Load the image from S3.
    image_url = os.path.join(DATA_URL_ROOT, selected_frame)
    image = load_image(image_url)

    # Add boxes for objects on the image. These are the boxes for the ground image.
    boxes = metadata[metadata.frame == selected_frame].drop(columns=["frame"])
    draw_image_with_boxes(image, boxes, "Ground Truth",
                          "**Human-annotated data** (frame `%i`)" % selected_frame_index)

    # Get the boxes for the objects detected by YOLO by running the YOLO model.
    yolo_boxes = yolo_v3(image, confidence_threshold, overlap_threshold)
    draw_image_with_boxes(image, yolo_boxes, "Real-time Computer Vision",
                          "**YOLO v3 Model** (overlap `%3.1f`) (confidence `%3.1f`)" % (overlap_threshold, confidence_threshold))

# This sidebar UI is a little search engine to find certain object types.
Example #24
Source File: EDA.py From G-Bert with MIT License | 6 votes |
def process_side():
    print('process_side')
    side_pd = pd.read_csv(patient_info_file)

    # just use demographic information to avoid future information leak
    # such as lab test and lab measurements
    side_pd = side_pd[['subject_id', 'hadm_id', 'icustay_id', 'gender_male',
                       'admission_type', 'first_icu_stay', 'admission_age',
                       'ethnicity', 'weight', 'height']]

    # process side_information
    side_pd = side_pd.dropna(thresh=4)
    side_pd.fillna(side_pd.mean(), inplace=True)
    side_pd = side_pd.groupby(by=['subject_id', 'hadm_id']).head(
        [1]).reset_index(drop=True)
    side_pd = pd.concat(
        [side_pd, pd.get_dummies(side_pd['ethnicity'])], axis=1)
    side_pd.drop(columns=['ethnicity', 'icustay_id'], inplace=True)
    side_pd.rename(columns={'subject_id': 'SUBJECT_ID',
                            'hadm_id': 'HADM_ID'}, inplace=True)

    return side_pd.reset_index(drop=True)
Example #25
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_dataframe_dummies_with_na(self, df, sparse, dtype):
    df.loc[3, :] = [np.nan, np.nan, np.nan]
    result = get_dummies(df, dummy_na=True,
                         sparse=sparse, dtype=dtype).sort_index(axis=1)

    expected = DataFrame({'C': [1, 2, 3, np.nan],
                          'A_a': [1, 0, 1, 0],
                          'A_b': [0, 1, 0, 0],
                          'A_nan': [0, 0, 0, 1],
                          'B_b': [1, 1, 0, 0],
                          'B_c': [0, 0, 1, 0],
                          'B_nan': [0, 0, 0, 1]}).sort_index(axis=1)
    e_dtype = self.effective_dtype(dtype)
    columns = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']
    expected[columns] = expected[columns].astype(e_dtype)
    assert_frame_equal(result, expected)

    result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
    expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
    assert_frame_equal(result, expected)
Example #26
Source File: multi_input.py From aboleth with Apache License 2.0 | 6 votes |
def input_fn(df):
    """Format the downloaded data."""
    # Creates a dictionary mapping from each continuous feature column name (k)
    # to the values of that column stored in a constant Tensor.
    continuous_cols = [df[k].values for k in CONTINUOUS_COLUMNS]
    X_con = np.stack(continuous_cols).astype(np.float32).T

    # Standardise
    X_con -= X_con.mean(axis=0)
    X_con /= X_con.std(axis=0)

    # Creates a dictionary mapping from each categorical feature column name
    categ_cols = [np.where(pd.get_dummies(df[k]).values)[1][:, np.newaxis]
                  for k in CATEGORICAL_COLUMNS]
    n_values = [np.amax(c) + 1 for c in categ_cols]
    X_cat = np.concatenate(categ_cols, axis=1).astype(np.int32)

    # Converts the label column into a constant Tensor.
    label = df[LABEL_COLUMN].values[:, np.newaxis]

    # Returns the feature columns and the label.
    return X_con, X_cat, n_values, label
Example #27
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_basic_drop_first(self, sparse):
    # GH12402 Add a new parameter `drop_first` to avoid collinearity
    # Basic case
    s_list = list('abc')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame({'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype=np.uint8)

    result = get_dummies(s_list, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    result = get_dummies(s_series, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    expected.index = list('ABC')
    result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)
Example #28
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_basic_drop_first_NA(self, sparse):
    # Test NA handling together with drop_first
    s_NA = ['a', 'b', np.nan]
    res = get_dummies(s_NA, drop_first=True, sparse=sparse)
    exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
    assert_frame_equal(res, exp)

    res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
    exp_na = DataFrame(
        {'b': [0, 1, 0],
         nan: [0, 0, 1]},
        dtype=np.uint8).reindex(['b', nan], axis=1)
    assert_frame_equal(res_na, exp_na)

    res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
                              sparse=sparse)
    exp_just_na = DataFrame(index=np.arange(1))
    assert_frame_equal(res_just_na, exp_just_na)
Example #29
Source File: model.py From cloudml-samples with Apache License 2.0 | 6 votes |
def generator_input(filenames, chunk_size, batch_size=64):
    """Produce features and labels needed by keras fit_generator."""
    feature_cols = None
    while True:
        input_reader = pd.read_csv(
            tf.gfile.Open(filenames[0]),
            names=CSV_COLUMNS,
            chunksize=chunk_size,
            na_values=' ?')

        for input_data in input_reader:
            input_data = input_data.dropna()
            label = pd.get_dummies(input_data.pop(LABEL_COLUMN))

            input_data = to_numeric_features(input_data, feature_cols)

            # Retains schema for next chunk processing.
            if feature_cols is None:
                feature_cols = input_data.columns

            idx_len = input_data.shape[0]
            for index in range(0, idx_len, batch_size):
                yield (input_data.iloc[index:min(idx_len, index + batch_size)],
                       label.iloc[index:min(idx_len, index + batch_size)])
Example #30
Source File: test_reshape.py From vnpy_crypto with MIT License | 6 votes |
def test_basic_drop_first_one_level(self, sparse):
    # Test the case that categorical variable only has one level.
    s_list = list('aaa')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame(index=np.arange(3))

    result = get_dummies(s_list, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    result = get_dummies(s_series, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)

    expected = DataFrame(index=list('ABC'))
    result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
    assert_frame_equal(result, expected)