Python dask.dataframe.read_csv() Examples

The following are 19 code examples of dask.dataframe.read_csv(), collected from open-source projects. Each example is attributed to its original project and source file, so you can follow up there for full context. You may also want to look at the other functions and classes available in the dask.dataframe module.
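Before the project examples, here is a minimal, self-contained sketch of the function itself; the file path and column name are illustrative placeholders, not taken from any of the projects below.

import dask.dataframe as dd

# Lazily read one or many CSV files; glob patterns and a blocksize are supported.
df = dd.read_csv("data/part-*.csv", blocksize="64MB", dtype={"id": "int64"})

# Operations build a lazy task graph; .compute() materializes a pandas object.
print(df.head())                           # reads only the first partition
counts = df.groupby("id").size().compute()
print(counts)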
Example #1
Source File: tf_utils.py    From recommender-tensorflow with MIT License
def tf_csv_dataset(csv_path, label_col, col_defaults, shuffle=False, batch_size=32):
    df = dd.read_csv(csv_path)
    # use col_defaults if specified for col, else use defaults based on col type
    type_defaults = {np.int64: 0, np.float64: 0.0, np.object_: ""}
    record_defaults = [[col_defaults.get(col_name, type_defaults.get(col_type.type, ""))]
                       for col_name, col_type in df.dtypes.items()]

    def parse_csv(value):
        columns = tf.decode_csv(value, record_defaults)
        features = dict(zip(df.columns.tolist(), columns))
        label = features[label_col]
        return features, label

    # read, parse, shuffle and batch dataset
    dataset = tf.data.TextLineDataset(csv_path).skip(1)  # skip header
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1024)
    dataset = dataset.map(parse_csv, num_parallel_calls=8)
    dataset = dataset.batch(batch_size)
    return dataset 
Example #2
Source File: data_loaders.py    From nlp-recipes with MIT License
def __init__(self, file_path, block_size=10e6, random_seed=None, lines=True):
        """Initializes the loader.

        Args:
            file_path (str): Path to delimited file.
            block_size (int, optional): Size of partition in bytes.
                See dask.dataframe.read_json().
                Defaults to 10e6.
            random_seed (int, optional): Random seed. See random.seed().
                Defaults to None.
            lines (bool, optional): Read the file as a json object per line. Defaults to True.
        """

        self.df = dd.read_json(file_path, blocksize=block_size, lines=lines)

        self.random_seed = random_seed
        random.seed(random_seed) 
Example #3
Source File: NucleiClassification.py    From HistomicsTK with Apache License 2.0
def read_feature_file(args):

    fname, feature_file_format = os.path.splitext(args.inputNucleiFeatureFile)

    if feature_file_format == '.csv':

        ddf = dd.read_csv(args.inputNucleiFeatureFile)

    elif feature_file_format == '.h5':

        ddf = dd.read_hdf(args.inputNucleiFeatureFile, 'Features')

    else:
        raise ValueError('Extension of the input feature file must be .csv or .h5')

    return ddf 
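A minimal sketch of calling the helper above: it only needs an object exposing an inputNucleiFeatureFile attribute and dispatches on the file extension. The namespace and file name here are illustrative assumptions, not part of the original example.

from types import SimpleNamespace

# Illustrative only: any object with an `inputNucleiFeatureFile` attribute works.
args = SimpleNamespace(inputNucleiFeatureFile='nuclei_features.csv')
ddf = read_feature_file(args)   # returns a dask dataframe built from the .csv or .h5 file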
Example #4
Source File: update_landsat_metadata.py    From Landsat578 with Apache License 2.0
def split_list(_list=LATEST):

    print('Please wait while scene metadata is split')
    try:
        csv = read_csv(_list, dtype={'PRODUCT_ID': object, 'COLLECTION_NUMBER': object,
                                     'COLLECTION_CATEGORY': object},
                       blocksize=25e6, parse_dates=True)
    except EmptyDataError:
        print('Metadata has already been updated for the day.')
        return None

    csv = csv[csv.COLLECTION_NUMBER != 'PRE']

    sats = unique(csv.SPACECRAFT_ID).tolist()
    for sat in sats:
        print(sat)
        df = csv[csv.SPACECRAFT_ID == sat]
        dst = os.path.join(SCENES, sat)
        if os.path.isfile(dst):
            os.remove(dst)
        if not os.path.isdir(dst):
            os.mkdir(dst)
        df.to_parquet('{}'.format(dst))

    return None 
Example #5
Source File: data_loaders.py    From nlp-recipes with MIT License
def __init__(self, file_path, sep=",", header="infer", block_size=10e6, random_seed=None):
        """Initializes the loader.

        Args:
            file_path (str): Path to delimited file.
            sep (str, optional): Delimiter. Defaults to ",".
            header (str, optional): Row number(s) to use as the column names.
                See pandas.read_csv().
                Defaults to "infer".
            block_size (int, optional): Size of partition in bytes.
                See dask.dataframe.read_csv().
                Defaults to 10e6.
            random_seed (int, optional): Random seed. See random.seed().
                Defaults to None.
        """

        self.df = dd.read_csv(file_path, sep=sep, header=header, blocksize=block_size)

        self.random_seed = random_seed
        random.seed(random_seed) 
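As a rough illustration of the block_size argument documented above: the blocksize passed to dd.read_csv controls how the file is split into partitions, so smaller values produce more, smaller partitions. The file name below is a placeholder.

import dask.dataframe as dd

# Placeholder path; smaller blocksize -> more partitions.
small = dd.read_csv("reviews.csv", blocksize="10MB")
large = dd.read_csv("reviews.csv", blocksize="100MB")
print(small.npartitions, large.npartitions)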
Example #6
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0
def kmeans_input_fn(self, name, csv_path=None):
        """Input function for kmeans

        Arguments:
                name : string, Name of the data [Train or Eval]
                csv_path : The path of the csv on any storage system

        Returns:
                A batch of features
        """
        pattern = self._get_pattern(name, csv_path)
        tf.logging.info('The Pattern of files is : %s', pattern)
        df = dd.read_csv(pattern)
        vectors = dask.compute(df.values)
        return tf.train.limit_epochs(
            tf.convert_to_tensor(vectors[0], dtype=tf.float32), num_epochs=1) 
Example #7
Source File: make_parquet.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def main():
    client = Client()  # noqa

    categories = ["category_%d" % i for i in range(26)]
    columns = ["click"] + ["numeric_%d" % i for i in range(13)] + categories

    df = dd.read_csv("day_1", sep="\t", names=columns, header=None)

    encoding = {c: "bytes" for c in categories}
    fixed = {c: 8 for c in categories}
    df.to_parquet(
        "day-1-bytes.parquet",
        object_encoding=encoding,
        fixed_text=fixed,
        compression="SNAPPY",
    ) 
Example #8
Source File: test_fit_predict.py    From dask-lightgbm with BSD 3-Clause "New" or "Revised" License
def test_regress_newsread(client, listen_port):
    data = dd.read_csv('./system_tests/data/*.gz', compression='gzip', blocksize=None)
    dX = data.iloc[:, 1:]
    dy = data.iloc[:, 0]

    d_regress = dlgbm.LGBMRegressor(n_estimators=50, local_listen_port=listen_port)
    d_regress.fit(dX, dy)

    dy_pred = d_regress.predict(dX, client=client)

    # The dask_ml.metrics.r2_score method fails with dataframes so we compute the R2 score ourselves
    numerator = ((dy - dy_pred) ** 2).sum()
    denominator = ((dy - dy.mean()) ** 2).sum()
    r2_score = 1 - numerator / denominator
    r2_score = r2_score.compute()
    print(r2_score)

    assert r2_score > 0.8 
Example #9
Source File: print.py    From autogbt-alt with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()

    path = str(Path(args.input)/'*.csv')
    df = dd.read_csv(
        path,
        dtype={
            'n_trials': 'float64',
        },
    ).compute()
    df = df.groupby(['model', 'dataset']).agg({
        'duration[s]': ['mean', 'std'],
        'CV AUC': ['mean', 'std'],
    })
    dummy = pd.DataFrame(index=[
        ('xgb', 'avazu'),
        ('lgb', 'avazu'),
    ])
    df = df.append(dummy)
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    for c in ['duration[s]', 'CV AUC']:
        mean = '%s_mean' % (c)
        std = '%s_std' % (c)
        df[std] = df[std].apply(lambda d: '±%.3f' % (d))
        df[mean] = df[mean].apply(lambda d: '%.3f' % (d))
        df[c] = (df[mean] + df[std]).apply(_handle_nan)
    df['dataset'] = df['dataset'].map(const.competitions)
    df['model'] = df['model'].map(const.models)
    df = df[['dataset', 'model', 'duration[s]', 'CV AUC']]
    df = df.sort_values(['dataset', 'model'])
    df = df.reset_index(drop=True)
    df['model'] = df['model'].apply(lambda d: d[1])

    for dset, grp in df.groupby('dataset'):
        grp.pop('dataset')
        md = tabulate(grp.values, grp.columns, tablefmt='pipe', floatfmt='.3f')
        print('#### %s\n' % (dset))
        print(md + '\n') 
Example #10
Source File: io.py    From EarthSim with BSD 3-Clause "New" or "Revised" License
def open_gssha(filename):
    """
    Reads various filetypes produced by GSSHA
    """
    # Read metadata
    ftype = filename.split('.')[-1]
    if ftype in ['fgd', 'asc']:
        f = open(filename, 'r')
        c, r, xlc, ylc, gsize, nanval = [
            t(f.readline().split(' ')[-1].split('\n')[0])
            for t in [int, int, float, float, float, float]
        ]
        xs = np.linspace(xlc+gsize/2., xlc+c*gsize-gsize/2., c+1)
        ys = np.linspace(ylc+gsize/2., ylc+r*gsize-gsize/2., r)
    else:
        header_df = pd.read_table(filename, engine='python',
                              names=['meta_key', 'meta_val'],
                              sep=' ', nrows=6)
        bounds = header_df.loc[:3, 'meta_val'].values.astype(float)
        r, c = header_df.loc[4:6, 'meta_val'].values.astype(int)
        xs, ys = get_sampling(bounds, (r, c))
    
    # Read data using dask
    ddf = dd.read_csv(filename, skiprows=6, header=None,
                      sep=' ')
    darr = ddf.values.compute()
        
    if ftype == 'fgd':
        darr[darr==nanval] = np.NaN
    
    return xr.DataArray(darr[::-1], coords={'x': xs, 'y': ys},
                        name='z', dims=['y', 'x']) 
Example #11
Source File: vis_model_and_task.py    From autogbt-alt with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()

    data_dir = Path(args.input)
    path = str(data_dir/'*.csv')
    df = dd.read_csv(path).compute()
    df['model'] = df['model'].map(const.models)
    df = df.sort_values(['dataset', 'model']).reset_index(drop=True)
    df = df.groupby(['model', 'dataset']).agg({
        'CV AUC': ['mean', 'std'],
        'duration[s]': ['mean', 'std'],
    })
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    df['model'] = df['model'].apply(lambda d: d[1])
    print(df)

    # plot
    plt.figure(figsize=(8, 6))
    for i, (_, model) in enumerate(const.models.values()):
        for j, dset in enumerate(['airline', 'amazon', 'bank']):
            idx = (df['model'] == model) &\
                  (df['dataset'] == dset)
            x = df.loc[idx, 'duration[s]_mean']
            y = df.loc[idx, 'CV AUC_mean']
            xerr = df.loc[idx, 'duration[s]_std']
            yerr = df.loc[idx, 'CV AUC_std']
            fmt = '%sC%d' % (['o', 's', 'D', '^'][j], i)
            label = 'model=%s, dataset=%s' % (model, dset)  # noqa
            plt.errorbar(x, y, xerr=xerr, yerr=yerr, fmt=fmt, label=label)

    plt.title('Model Comparison')
    plt.xlabel('Training Time[s]')
    plt.ylabel('CV AUC')
    plt.legend(loc='lower right')
    plt.savefig(data_dir/'model_and_task.png') 
Example #12
Source File: vis_frac_and_duration.py    From autogbt-alt with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()

    data_dir = Path(args.input)
    name = data_dir.parent.name
    path = str(data_dir/'*.csv')
    df = dd.read_csv(path).compute()
    df = df.groupby(['model', 'n_trials', 'model_train_frac']).agg({
        'CV AUC': ['mean', 'std'],
        'duration[s]': ['mean', 'std'],
    })
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    print(df)

    # plot
    fracs = sorted(df['model_train_frac'].unique())
    plt.figure(figsize=(12, 5))
    for j, frac in enumerate(fracs):
        for i, n_trials in enumerate([1, 10, 20, 30]):
            idx = (df['model'] == 'auto') &\
                  (df['n_trials'] == n_trials) &\
                  (df['model_train_frac'] == frac)
            x = df.loc[idx, 'duration[s]_mean']
            y = df.loc[idx, 'CV AUC_mean']
            xerr = df.loc[idx, 'duration[s]_std']
            yerr = df.loc[idx, 'CV AUC_std']
            fmt = '%sC%d' % (['x', 'o', 's', 'D'][i], j)
            label = 'n_trials=%d, model_train_frac=%.2f' % (n_trials, frac)  # noqa
            plt.errorbar(x, y, xerr=xerr, yerr=yerr, fmt=fmt, label=label)

    plt.title('Parameter Comparison (dataset=%s)' % (name))
    plt.xlabel('Training Time[s]')
    plt.ylabel('CV AUC')
    plt.legend()
    plt.savefig(str(data_dir/'frac-and-n_trials.png')) 
Example #13
Source File: viirs_edr_active_fires.py    From satpy with GNU General Public License v3.0
def __init__(self, filename, filename_info, filetype_info):
        """Make sure filepath is valid and then reads data into a Dask DataFrame.

        Args:
            filename: Filename
            filename_info: Filename information
            filetype_info: Filetype information

        """
        skip_rows = filetype_info.get('skip_rows', 15)
        columns = filetype_info['columns']
        self.file_content = dd.read_csv(filename, skiprows=skip_rows, header=None, names=columns)
        super(VIIRSActiveFiresTextFileHandler, self).__init__(filename, filename_info, filetype_info)
        self.platform_name = PLATFORM_MAP.get(self.filename_info['satellite_name'].upper(), "unknown") 
Example #14
Source File: test_fit_predict.py    From dask-lightgbm with BSD 3-Clause "New" or "Revised" License
def test_classify_newsread(client, listen_port):
    data = dd.read_csv('./system_tests/data/*.gz', compression='gzip', blocksize=None)
    dX = data.iloc[:, :-1]
    dy = data.iloc[:, -1]

    d_classif = dlgbm.LGBMClassifier(n_estimators=50, local_listen_port=listen_port)
    d_classif.fit(dX, dy)

    dy_pred = d_classif.predict(dX, client=client)

    acc_score = (dy == dy_pred).sum() / len(dy)
    acc_score = acc_score.compute()
    print(acc_score)

    assert acc_score > 0.8 
Example #15
Source File: utils.py    From pystore with Apache License 2.0
def read_csv(urlpath, *args, **kwargs):
    def rename_dask_index(df, name):
        df.index.name = name
        return df

    index_col = index_name = None

    if "index" in kwargs:
        del kwargs["index"]
    if "index_col" in kwargs:
        index_col = kwargs["index_col"]
        if isinstance(index_col, list):
            index_col = index_col[0]
        del kwargs["index_col"]
    if "index_name" in kwargs:
        index_name = kwargs["index_name"]
        del kwargs["index_name"]

    df = dd.read_csv(urlpath, *args, **kwargs)

    if index_col is not None:
        df = df.set_index(index_col)

    if index_name is not None:
        df = df.map_partitions(rename_dask_index, index_name)

    return df 
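A brief usage sketch for the wrapper above, assuming CSV files that carry a timestamp column; the glob pattern and column names are illustrative, not taken from pystore itself.

# Illustrative call of the read_csv wrapper defined above.
ddf = read_csv('trades/*.csv', index_col=['timestamp'], index_name='time',
               parse_dates=['timestamp'])
print(ddf.index.name)   # should report 'time' after the per-partition rename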
Example #16
Source File: ml_100k.py    From recommender-tensorflow with MIT License
def load_data(src_dir="data/ml-100k"):
    data = {item: dd.read_csv(str(Path(src_dir, conf["filename"])), sep=conf["sep"],
                              header=None, names=conf["columns"], encoding="latin-1")
            for item, conf in DATA_CONFIG.items()}

    logger.info("data loaded.")
    return data 
Example #17
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0
def _parse_csv(self):
        """Reads csv files in dask to determine the datatypes and other features about data
        this helps in creating a dataset object in tensorflow

        Returns:
                df : dask dataframe, parsed dataframe object
                list(df.columns) : list, list of column names
        """
        if self.gcs_path:
            if isinstance(self.csv_path, list):
                for index, path in enumerate(self.csv_path):
                    parse_result = urlparse(path)
                    bucket = parse_result.hostname
                    csv_name = parse_result.path
                    self._download_csv(
                        bucket,
                        csv_name,
                        path_name='/tmp/data_' +
                                  str(index) +
                                  '.csv')
                csv_path = '/tmp/data_*.csv'
            else:
                parse_result = urlparse(self.csv_path)
                bucket = parse_result.hostname
                csv_name = parse_result.path
                self._download_csv(bucket, csv_name)
                csv_path = '/tmp/data.csv'
        else:
            csv_path = self.csv_path

        if self.column_names:
            header = None
        else:
            header = 'infer'

        try:
            df = dd.read_csv(
                csv_path,
                names=self.column_names,
                header=header,
                na_values=self.na_values,
                sample=12800000,
                dtype=self.data_type)
            if isinstance(csv_path, list):
                len(df)  # Checks whether schema is consistent throughout the data
        except Exception:
            raise AssertionError(
                'Data types given are inconsistent with data provided')

        if self.to_drop is not None:
            drop_column_names = self.to_drop
            drop_column_names = [
                name for name in drop_column_names if name in df.columns]
            df = self.drop_cols(df, drop_column_names)
            tf.logging.info('Dropping the columns : %s', drop_column_names)

        return df, list(df.columns) 
Example #18
Source File: tests-main.py    From d6tpipe with MIT License
def test_intro_stat_learning(self, cleanup, signup, testcfg):
        cfg_name = cfg_settings_islr['name']
        cfg_filenames_islr = ['Advertising.csv', 'Advertising2.csv', 'Auto.csv', 'Ch10Ex11.csv', 'College.csv', 'Credit.csv', 'Heart.csv', 'Income1.csv', 'Income2.csv', 'LICENSE.md', 'README.md']

        # start with local repo
        pipelocal = d6tpipe.PipeLocal(cfg_name, profile=cfg_profile, filecfg=cfg_cfgfname)
        pipelocal.delete_files_local(confirm=False,delete_all=False)
        pipelocal.import_dir('tests/intro-stat-learning/')
        assert pipelocal.scan_local() == cfg_filenames_islr
        assert pipelocal.files() == []
        assert pipelocal.files(fromdb=False) == cfg_filenames_islr

        df = pd.read_csv(pipelocal.dirpath/'Advertising.csv')
        assert not df.empty

        if not testcfg.get('local', False):
            # set up public repo
            api = getapi()
            d6tpipe.upsert_pipe(api, cfg_settings_islr)
            d6tpipe.upsert_permissions(api, cfg_name, {"username": 'public', "role": "read"})
            pipe = d6tpipe.Pipe(api,cfg_name,mode='all')
            pipe.delete_files_remote(confirm=False)
            assert pipe.scan_remote(cached=False) == []
            assert pipe.push()==cfg_filenames_islr
            pipelocal = d6tpipe.PipeLocal(cfg_name, profile=cfg_profile, filecfg=cfg_cfgfname)
            assert len(pipelocal.schema)>0

            api2 = getapi2()
            pipe = d6tpipe.Pipe(api2,cfg_name)
            pipe.delete_files_local(confirm=False, delete_all=False)
            assert pipe.pull()==cfg_filenames_islr

            df = pd.read_csv(pipe.dirpath / 'Advertising.csv', **pipe.schema['pandas'])
            assert not df.empty

            import dask.dataframe as dd
            files = pipe.filepaths(include='Advertising*.csv')
            ddf = dd.read_csv(files, **pipe.schema['dask'])
            assert not ddf.compute().empty
            pipe.delete_files_local(confirm=False, delete_all=False)

        pipelocal.delete_files_local(confirm=False,delete_all=True) 
Example #19
Source File: tests-main.py    From d6tpipe with MIT License
def test_pipes_pull(self, cleanup, signup, parentinit, pipeinit, testcfg):
        api = getapi(testcfg.get('local',False))
        pipe = getpipe(api)
        assert pipe.name in api.list_pipes()

        cfg_chk_crc = ['8a9782e9efa8befa9752045ca506a62e',
         '5fe579d6b71031dad399e8d4ea82820b',
         '4c7da169df85253d7ff737dde1e7400b',
         'ca62a122993494e763fd1676cce95e76']

        # assert False
        assert pipe.files() == []
        assert pipe.scan_remote() == cfg_filenames_chk
        r, d = pipe.scan_remote(attributes=True)
        assert _filenames(d) == cfg_filenames_chk
        assert [o['crc'] for o in d]==cfg_chk_crc

        assert api.list_local_pipes()==[]
        assert pipe.pull_preview() == cfg_filenames_chk
        assert pipe.pull() == cfg_filenames_chk
        assert pipe.pull_preview() == []
        assert api.list_local_pipes()==[pipe.name]

        assert pipe.files() == cfg_filenames_chk
        assert pipe.filepaths() == [Path(pipe.dirpath)/f for f in pipe.files()]
        assert pipe.filepaths(aspathlib=False) == [str(Path(pipe.dirpath)/f) for f in pipe.files()]

        pipe = getpipe(api, chk_empty=False, mode='all')
        assert pipe.pull_preview() == cfg_filenames_chk

        # PipeLocal
        pipelocal = d6tpipe.PipeLocal(pipe.name,profile=cfg_profile, filecfg=cfg_cfgfname)
        assert pipelocal.files() == cfg_filenames_chk
        assert pipelocal.scan_local() == cfg_filenames_chk
        assert pipelocal.schema == cfg_settings_pipe['schema']
        df = pd.read_csv(pipe.dirpath/cfg_filenames_chk[0], **pipe.schema['pandas'])

        # permissions
        if not testcfg.get('local',False):
            api2 = getapi2(testcfg.get('local', False))
            with pytest.raises(APIError, match='403'):
                pipe2 = getpipe(api2, name=cfg_pipe_name, mode='all')
                pipe2.pull()

            settings = {"username": cfg_usr2, "role": "read"}
            r,d = d6tpipe.upsert_permissions(api, cfg_parent_name, settings)

            pipe2 = getpipe(api2, name=cfg_pipe_name, mode='all')
            assert pipe2.pull()==cfg_filenames_chk

        # cleanup
        pipe.delete_files_local(confirm=False,delete_all=True)