Python dask.dataframe.read_csv() Examples
The following are 19 code examples of dask.dataframe.read_csv(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module dask.dataframe, or try the search function.
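Before the project examples, a minimal sketch of dask.dataframe.read_csv() on its own may be helpful; the glob path, blocksize, column name, and dtype below are hypothetical placeholders rather than values taken from any of the projects that follow.

import dask.dataframe as dd

# Lazily read a (possibly multi-file) CSV dataset into a Dask DataFrame.
# "data/*.csv", blocksize=64e6, and the dtype override are illustrative values only.
df = dd.read_csv("data/*.csv", blocksize=64e6, dtype={"id": "int64"})

print(df.npartitions)                       # number of lazy partitions
print(df.head())                            # reads only the start of the first partition
print(df.groupby("id").size().compute())    # the full read happens at compute()

As in the examples below, nothing is actually loaded until .compute() (or a method such as .head()) forces execution.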
Example #1
Source File: tf_utils.py From recommender-tensorflow with MIT License | 6 votes |
def tf_csv_dataset(csv_path, label_col, col_defaults, shuffle=False, batch_size=32):
    df = dd.read_csv(csv_path)

    # use col_defaults if specified for col, else use defaults based on col type
    type_defaults = {np.int64: 0, np.float64: 0.0, np.object_: ""}
    record_defaults = [[col_defaults.get(col_name, type_defaults.get(col_type.type, ""))]
                       for col_name, col_type in df.dtypes.items()]

    def parse_csv(value):
        columns = tf.decode_csv(value, record_defaults)
        features = dict(zip(df.columns.tolist(), columns))
        label = features[label_col]
        return features, label

    # read, parse, shuffle and batch dataset
    dataset = tf.data.TextLineDataset(csv_path).skip(1)  # skip header
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1024)
    dataset = dataset.map(parse_csv, num_parallel_calls=8)
    dataset = dataset.batch(batch_size)
    return dataset
Example #2
Source File: data_loaders.py From nlp-recipes with MIT License | 6 votes |
def __init__(self, file_path, block_size=10e6, random_seed=None, lines=True):
    """Initializes the loader.

    Args:
        file_path (str): Path to delimited file.
        block_size (int, optional): Size of partition in bytes.
            See dask.dataframe.read_csv(). Defaults to 10e6.
        random_seed (int, optional): Random seed. See random.seed().
            Defaults to None.
        lines (bool, optional): Read the file as a json object per line.
            Defaults to True.
    """
    self.df = dd.read_json(file_path, blocksize=block_size, lines=lines)
    self.random_seed = random_seed
    random.seed(random_seed)
Example #3
Source File: NucleiClassification.py From HistomicsTK with Apache License 2.0 | 6 votes |
def read_feature_file(args):
    fname, feature_file_format = os.path.splitext(args.inputNucleiFeatureFile)

    if feature_file_format == '.csv':
        ddf = dd.read_csv(args.inputNucleiFeatureFile)
    elif feature_file_format == '.h5':
        ddf = dd.read_hdf(args.inputNucleiFeatureFile, 'Features')
    else:
        raise ValueError('Extension of output feature file must be .csv or .h5')

    return ddf
Example #4
Source File: update_landsat_metadata.py From Landsat578 with Apache License 2.0 | 6 votes |
def split_list(_list=LATEST):
    print('Please wait while scene metadata is split')
    try:
        csv = read_csv(_list, dtype={'PRODUCT_ID': object,
                                     'COLLECTION_NUMBER': object,
                                     'COLLECTION_CATEGORY': object},
                       blocksize=25e6, parse_dates=True)
    except EmptyDataError:
        print('Metadata has already been updated for the day.')
        return None

    csv = csv[csv.COLLECTION_NUMBER != 'PRE']

    sats = unique(csv.SPACECRAFT_ID).tolist()
    for sat in sats:
        print(sat)
        df = csv[csv.SPACECRAFT_ID == sat]
        dst = os.path.join(SCENES, sat)
        if os.path.isfile(dst):
            os.remove(dst)
        if not os.path.isdir(dst):
            os.mkdir(dst)
        df.to_parquet('{}'.format(dst))

    return None
Example #5
Source File: data_loaders.py From nlp-recipes with MIT License | 6 votes |
def __init__(self, file_path, sep=",", header="infer", block_size=10e6, random_seed=None): """Initializes the loader. Args: file_path (str): Path to delimited file. sep (str, optional): Delimiter. Defaults to ",". header (str, optional): Number of rows to be used as the header. See pandas.read_csv() Defaults to "infer". block_size (int, optional): Size of partition in bytes. See dask.dataframe.read_csv() Defaults to 10e6. random_seed (int, optional): Random seed. See random.seed(). Defaults to None. """ self.df = dd.read_csv(file_path, sep=sep, header=header, blocksize=block_size) self.random_seed = random_seed random.seed(random_seed)
Example #6
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0 | 6 votes |
def kmeans_input_fn(self, name, csv_path=None):
    """Input function for kmeans

    Arguments:
        name : string, Name of the data [Train or Eval]
        csv_path : The path of the csv on any storage system

    Returns:
        A batch of features
    """
    pattern = self._get_pattern(name, csv_path)
    tf.logging.info('The Pattern of files is : %s', pattern)
    df = dd.read_csv(pattern)
    vectors = dask.compute(df.values)
    return tf.train.limit_epochs(
        tf.convert_to_tensor(vectors[0], dtype=tf.float32), num_epochs=1)
Example #7
Source File: make_parquet.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def main():
    client = Client()  # noqa
    categories = ["category_%d" % i for i in range(26)]
    columns = ["click"] + ["numeric_%d" % i for i in range(13)] + categories
    df = dd.read_csv("day_1", sep="\t", names=columns, header=None)
    encoding = {c: "bytes" for c in categories}
    fixed = {c: 8 for c in categories}
    df.to_parquet(
        "day-1-bytes.parquet",
        object_encoding=encoding,
        fixed_text=fixed,
        compression="SNAPPY",
    )
Example #8
Source File: test_fit_predict.py From dask-lightgbm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_regress_newsread(client, listen_port):
    data = dd.read_csv('./system_tests/data/*.gz', compression='gzip', blocksize=None)
    dX = data.iloc[:, 1:]
    dy = data.iloc[:, 0]

    d_regress = dlgbm.LGBMRegressor(n_estimators=50, local_listen_port=listen_port)
    d_regress.fit(dX, dy)

    dy_pred = d_regress.predict(dX, client=client)

    # The dask_ml.metrics.r2_score method fails with dataframes so we compute the R2 score ourselves
    numerator = ((dy - dy_pred) ** 2).sum()
    denominator = ((dy - dy.mean()) ** 2).sum()
    r2_score = 1 - numerator / denominator
    r2_score = r2_score.compute()
    print(r2_score)

    assert r2_score > 0.8
Example #9
Source File: print.py From autogbt-alt with MIT License | 5 votes |
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()
    path = str(Path(args.input)/'*.csv')
    df = dd.read_csv(
        path,
        dtype={
            'n_trials': 'float64',
        },
    ).compute()
    df = df.groupby(['model', 'dataset']).agg({
        'duration[s]': ['mean', 'std'],
        'CV AUC': ['mean', 'std'],
    })
    dummy = pd.DataFrame(index=[
        ('xgb', 'avazu'),
        ('lgb', 'avazu'),
    ])
    df = df.append(dummy)
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    for c in ['duration[s]', 'CV AUC']:
        mean = '%s_mean' % (c)
        std = '%s_std' % (c)
        df[std] = df[std].apply(lambda d: '±%.3f' % (d))
        df[mean] = df[mean].apply(lambda d: '%.3f' % (d))
        df[c] = (df[mean] + df[std]).apply(_handle_nan)
    df['dataset'] = df['dataset'].map(const.competitions)
    df['model'] = df['model'].map(const.models)
    df = df[['dataset', 'model', 'duration[s]', 'CV AUC']]
    df = df.sort_values(['dataset', 'model'])
    df = df.reset_index(drop=True)
    df['model'] = df['model'].apply(lambda d: d[1])
    for dset, grp in df.groupby('dataset'):
        grp.pop('dataset')
        md = tabulate(grp.values, grp.columns, tablefmt='pipe', floatfmt='.3f')
        print('#### %s\n' % (dset))
        print(md + '\n')
Example #10
Source File: io.py From EarthSim with BSD 3-Clause "New" or "Revised" License | 5 votes |
def open_gssha(filename):
    """
    Reads various filetypes produced by GSSHA
    """
    # Read metadata
    ftype = filename.split('.')[-1]
    if ftype in ['fgd', 'asc']:
        f = open(filename, 'r')
        c, r, xlc, ylc, gsize, nanval = [
            t(f.readline().split(' ')[-1].split('\n')[0])
            for t in [int, int, float, float, float, float]
        ]
        xs = np.linspace(xlc+gsize/2., xlc+c*gsize-gsize/2., c+1)
        ys = np.linspace(ylc+gsize/2., ylc+r*gsize-gsize/2., r)
    else:
        header_df = pd.read_table(filename, engine='python',
                                  names=['meta_key', 'meta_val'], sep=' ', nrows=6)
        bounds = header_df.loc[:3, 'meta_val'].values.astype(float)
        r, c = header_df.loc[4:6, 'meta_val'].values.astype(int)
        xs, ys = get_sampling(bounds, (r, c))

    # Read data using dask
    ddf = dd.read_csv(filename, skiprows=6, header=None, sep=' ')
    darr = ddf.values.compute()
    if ftype == 'fgd':
        darr[darr==nanval] = np.NaN

    return xr.DataArray(darr[::-1], coords={'x': xs, 'y': ys},
                        name='z', dims=['y', 'x'])
Example #11
Source File: vis_model_and_task.py From autogbt-alt with MIT License | 5 votes |
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()
    data_dir = Path(args.input)
    path = str(data_dir/'*.csv')
    df = dd.read_csv(path).compute()
    df['model'] = df['model'].map(const.models)
    df = df.sort_values(['dataset', 'model']).reset_index(drop=True)
    df = df.groupby(['model', 'dataset']).agg({
        'CV AUC': ['mean', 'std'],
        'duration[s]': ['mean', 'std'],
    })
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    df['model'] = df['model'].apply(lambda d: d[1])
    print(df)

    # plot
    plt.figure(figsize=(8, 6))
    for i, (_, model) in enumerate(const.models.values()):
        for j, dset in enumerate(['airline', 'amazon', 'bank']):
            idx = (df['model'] == model) &\
                (df['dataset'] == dset)
            x = df.loc[idx, 'duration[s]_mean']
            y = df.loc[idx, 'CV AUC_mean']
            xerr = df.loc[idx, 'duration[s]_std']
            yerr = df.loc[idx, 'CV AUC_std']
            fmt = '%sC%d' % (['o', 's', 'D', '^'][j], i)
            label = 'model=%s, dataset=%s' % (model, dset)  # noqa
            plt.errorbar(x, y, xerr=xerr, yerr=yerr, fmt=fmt, label=label)
    plt.title('Model Comparison')
    plt.xlabel('Training Time[s]')
    plt.ylabel('CV AUC')
    plt.legend(loc='lower right')
    plt.savefig(data_dir/'model_and_task.png')
Example #12
Source File: vis_frac_and_duration.py From autogbt-alt with MIT License | 5 votes |
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()
    data_dir = Path(args.input)
    name = data_dir.parent.name
    path = str(data_dir/'*.csv')
    df = dd.read_csv(path).compute()
    df = df.groupby(['model', 'n_trials', 'model_train_frac']).agg({
        'CV AUC': ['mean', 'std'],
        'duration[s]': ['mean', 'std'],
    })
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    print(df)

    # plot
    fracs = sorted(df['model_train_frac'].unique())
    plt.figure(figsize=(12, 5))
    for j, frac in enumerate(fracs):
        for i, n_trials in enumerate([1, 10, 20, 30]):
            idx = (df['model'] == 'auto') &\
                (df['n_trials'] == n_trials) &\
                (df['model_train_frac'] == frac)
            x = df.loc[idx, 'duration[s]_mean']
            y = df.loc[idx, 'CV AUC_mean']
            xerr = df.loc[idx, 'duration[s]_std']
            yerr = df.loc[idx, 'CV AUC_std']
            fmt = '%sC%d' % (['x', 'o', 's', 'D'][i], j)
            label = 'n_trials=%d, model_train_frac=%.2f' % (n_trials, frac)  # noqa
            plt.errorbar(x, y, xerr=xerr, yerr=yerr, fmt=fmt, label=label)
    plt.title('Parameter Comparison (dataset=%s)' % (name))
    plt.xlabel('Training Time[s]')
    plt.ylabel('CV AUC')
    plt.legend()
    plt.savefig(str(data_dir/'frac-and-n_trials.png'))
Example #13
Source File: viirs_edr_active_fires.py From satpy with GNU General Public License v3.0 | 5 votes |
def __init__(self, filename, filename_info, filetype_info):
    """Make sure filepath is valid and then reads data into a Dask DataFrame.

    Args:
        filename: Filename
        filename_info: Filename information
        filetype_info: Filetype information

    """
    skip_rows = filetype_info.get('skip_rows', 15)
    columns = filetype_info['columns']
    self.file_content = dd.read_csv(filename, skiprows=skip_rows, header=None, names=columns)
    super(VIIRSActiveFiresTextFileHandler, self).__init__(filename, filename_info, filetype_info)
    self.platform_name = PLATFORM_MAP.get(self.filename_info['satellite_name'].upper(), "unknown")
Example #14
Source File: test_fit_predict.py From dask-lightgbm with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_classify_newsread(client, listen_port):
    data = dd.read_csv('./system_tests/data/*.gz', compression='gzip', blocksize=None)
    dX = data.iloc[:, :-1]
    dy = data.iloc[:, -1]

    d_classif = dlgbm.LGBMClassifier(n_estimators=50, local_listen_port=listen_port)
    d_classif.fit(dX, dy)

    dy_pred = d_classif.predict(dX, client=client)

    acc_score = (dy == dy_pred).sum() / len(dy)
    acc_score = acc_score.compute()
    print(acc_score)

    assert acc_score > 0.8
Example #15
Source File: utils.py From pystore with Apache License 2.0 | 5 votes |
def read_csv(urlpath, *args, **kwargs):
    def rename_dask_index(df, name):
        df.index.name = name
        return df

    index_col = index_name = None
    if "index" in kwargs:
        del kwargs["index"]
    if "index_col" in kwargs:
        index_col = kwargs["index_col"]
        if isinstance(index_col, list):
            index_col = index_col[0]
        del kwargs["index_col"]
    if "index_name" in kwargs:
        index_name = kwargs["index_name"]
        del kwargs["index_name"]

    df = dd.read_csv(urlpath, *args, **kwargs)
    if index_col is not None:
        df = df.set_index(index_col)
    if index_name is not None:
        df = df.map_partitions(rename_dask_index, index_name)

    return df
Example #16
Source File: ml_100k.py From recommender-tensorflow with MIT License | 5 votes |
def load_data(src_dir="data/ml-100k"): data = {item: dd.read_csv(str(Path(src_dir, conf["filename"])), sep=conf["sep"], header=None, names=conf["columns"], encoding="latin-1") for item, conf in DATA_CONFIG.items()} logger.info("data loaded.") return data
Example #17
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0 | 4 votes |
def _parse_csv(self):
    """Reads csv files in dask to determine the datatypes and other features
    about the data; this helps in creating a dataset object in tensorflow

    Returns:
        df : dask dataframe, parsed dataframe object
        list(df.columns) : list, list of column names
    """
    if self.gcs_path:
        if isinstance(self.csv_path, list):
            for index, path in enumerate(self.csv_path):
                parse_result = urlparse(path)
                bucket = parse_result.hostname
                csv_name = parse_result.path
                self._download_csv(
                    bucket,
                    csv_name,
                    path_name='/tmp/data_' + str(index) + '.csv')
            csv_path = '/tmp/data_*.csv'
        else:
            parse_result = urlparse(self.csv_path)
            bucket = parse_result.hostname
            csv_name = parse_result.path
            self._download_csv(bucket, csv_name)
            csv_path = '/tmp/data.csv'
    else:
        csv_path = self.csv_path

    if self.column_names:
        header = None
    else:
        header = 'infer'

    try:
        df = dd.read_csv(
            csv_path,
            names=self.column_names,
            header=header,
            na_values=self.na_values,
            sample=12800000,
            dtype=self.data_type)
        if isinstance(csv_path, list):
            len(df)  # Checks whether schema is consistent throughout the data
    except Exception:
        raise AssertionError(
            'Data types given are inconsistent with data provided')

    if self.to_drop is not None:
        drop_column_names = self.to_drop
        drop_column_names = [
            name for name in drop_column_names if name in df.columns]
        df = self.drop_cols(df, drop_column_names)
        tf.logging.info('Dropping the columns : %s', drop_column_names)

    return df, list(df.columns)
Example #18
Source File: tests-main.py From d6tpipe with MIT License | 4 votes |
def test_intro_stat_learning(self, cleanup, signup, testcfg):
    cfg_name = cfg_settings_islr['name']
    cfg_filenames_islr = ['Advertising.csv', 'Advertising2.csv', 'Auto.csv', 'Ch10Ex11.csv',
                          'College.csv', 'Credit.csv', 'Heart.csv', 'Income1.csv',
                          'Income2.csv', 'LICENSE.md', 'README.md']

    # start with local repo
    pipelocal = d6tpipe.PipeLocal(cfg_name, profile=cfg_profile, filecfg=cfg_cfgfname)
    pipelocal.delete_files_local(confirm=False, delete_all=False)
    pipelocal.import_dir('tests/intro-stat-learning/')
    assert pipelocal.scan_local() == cfg_filenames_islr
    assert pipelocal.files() == []
    assert pipelocal.files(fromdb=False) == cfg_filenames_islr

    df = pd.read_csv(pipelocal.dirpath/'Advertising.csv')
    assert not df.empty

    if not testcfg.get('local', False):
        # set up public repo
        api = getapi()
        d6tpipe.upsert_pipe(api, cfg_settings_islr)
        d6tpipe.upsert_permissions(api, cfg_name, {"username": 'public', "role": "read"})

        pipe = d6tpipe.Pipe(api, cfg_name, mode='all')
        pipe.delete_files_remote(confirm=False)
        assert pipe.scan_remote(cached=False) == []
        assert pipe.push() == cfg_filenames_islr

        pipelocal = d6tpipe.PipeLocal(cfg_name, profile=cfg_profile, filecfg=cfg_cfgfname)
        assert len(pipelocal.schema) > 0

        api2 = getapi2()
        pipe = d6tpipe.Pipe(api2, cfg_name)
        pipe.delete_files_local(confirm=False, delete_all=False)
        assert pipe.pull() == cfg_filenames_islr

        df = pd.read_csv(pipe.dirpath / 'Advertising.csv', **pipe.schema['pandas'])
        assert not df.empty

        import dask.dataframe as dd
        files = pipe.filepaths(include='Advertising*.csv')
        ddf = dd.read_csv(files, **pipe.schema['dask'])
        assert not ddf.compute().empty

        pipe.delete_files_local(confirm=False, delete_all=False)

    pipelocal.delete_files_local(confirm=False, delete_all=True)
Example #19
Source File: tests-main.py From d6tpipe with MIT License | 4 votes |
def test_pipes_pull(self, cleanup, signup, parentinit, pipeinit, testcfg):
    api = getapi(testcfg.get('local', False))
    pipe = getpipe(api)
    assert pipe.name in api.list_pipes()

    cfg_chk_crc = ['8a9782e9efa8befa9752045ca506a62e',
                   '5fe579d6b71031dad399e8d4ea82820b',
                   '4c7da169df85253d7ff737dde1e7400b',
                   'ca62a122993494e763fd1676cce95e76']

    # assert False
    assert pipe.files() == []
    assert pipe.scan_remote() == cfg_filenames_chk
    r, d = pipe.scan_remote(attributes=True)
    assert _filenames(d) == cfg_filenames_chk
    assert [o['crc'] for o in d] == cfg_chk_crc

    assert api.list_local_pipes() == []
    assert pipe.pull_preview() == cfg_filenames_chk
    assert pipe.pull() == cfg_filenames_chk
    assert pipe.pull_preview() == []
    assert api.list_local_pipes() == [pipe.name]

    assert pipe.files() == cfg_filenames_chk
    assert pipe.filepaths() == [Path(pipe.dirpath)/f for f in pipe.files()]
    assert pipe.filepaths(aspathlib=False) == [str(Path(pipe.dirpath)/f) for f in pipe.files()]

    pipe = getpipe(api, chk_empty=False, mode='all')
    assert pipe.pull_preview() == cfg_filenames_chk

    # PipeLocal
    pipelocal = d6tpipe.PipeLocal(pipe.name, profile=cfg_profile, filecfg=cfg_cfgfname)
    assert pipelocal.files() == cfg_filenames_chk
    assert pipelocal.scan_local() == cfg_filenames_chk
    assert pipelocal.schema == cfg_settings_pipe['schema']
    df = pd.read_csv(pipe.dirpath/cfg_filenames_chk[0], **pipe.schema['pandas'])

    # permissions
    if not testcfg.get('local', False):
        api2 = getapi2(testcfg.get('local', False))
        with pytest.raises(APIError, match='403'):
            pipe2 = getpipe(api2, name=cfg_pipe_name, mode='all')
            pipe2.pull()

        settings = {"username": cfg_usr2, "role": "read"}
        r, d = d6tpipe.upsert_permissions(api, cfg_parent_name, settings)
        pipe2 = getpipe(api2, name=cfg_pipe_name, mode='all')
        assert pipe2.pull() == cfg_filenames_chk

    # cleanup
    pipe.delete_files_local(confirm=False, delete_all=True)