Python dask.dataframe.read_csv() Examples

The following are 19 code examples of dask.dataframe.read_csv(), collected from open-source projects. Each example is attributed to its original project and source file, so you can follow up there for full context. You may also want to look at the other functions and classes available in the dask.dataframe module.
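Before the project examples, here is a minimal, self-contained sketch of the function itself; the file path and column name are illustrative placeholders, not taken from any of the projects below.

import dask.dataframe as dd

# Lazily read one or many CSV files; glob patterns and a blocksize are supported.
df = dd.read_csv("data/part-*.csv", blocksize="64MB", dtype={"id": "int64"})

# Operations build a lazy task graph; .compute() materializes a pandas object.
print(df.head())                           # reads only the first partition
counts = df.groupby("id").size().compute()
print(counts)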
Example #1
Source File: tf_utils.py    From recommender-tensorflow with MIT License
def tf_csv_dataset(csv_path, label_col, col_defaults, shuffle=False, batch_size=32):
    df = dd.read_csv(csv_path)
    # use col_defaults if specified for col, else use defaults based on col type
    type_defaults = {np.int64: 0, np.float64: 0.0, np.object_: ""}
    record_defaults = [[col_defaults.get(col_name, type_defaults.get(col_type.type, ""))]
                       for col_name, col_type in df.dtypes.items()]

    def parse_csv(value):
        columns = tf.decode_csv(value, record_defaults)
        features = dict(zip(df.columns.tolist(), columns))
        label = features[label_col]
        return features, label

    # read, parse, shuffle and batch dataset
    dataset = tf.data.TextLineDataset(csv_path).skip(1)  # skip header
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1024)
    dataset = dataset.map(parse_csv, num_parallel_calls=8)
    dataset = dataset.batch(batch_size)
    return dataset 
Example #2
Source File: data_loaders.py    From nlp-recipes with MIT License
def __init__(self, file_path, block_size=10e6, random_seed=None, lines=True):
        """Initializes the loader.

        Args:
            file_path (str): Path to delimited file.
            block_size (int, optional): Size of partition in bytes.
                See dask.dataframe.read_json().
                Defaults to 10e6.
            random_seed (int, optional): Random seed. See random.seed().
                Defaults to None.
            lines (bool, optional): Read the file as a json object per line. Defaults to True.
        """

        self.df = dd.read_json(file_path, blocksize=block_size, lines=lines)

        self.random_seed = random_seed
        random.seed(random_seed) 
Example #3
Source File: NucleiClassification.py    From HistomicsTK with Apache License 2.0
def read_feature_file(args):

    fname, feature_file_format = os.path.splitext(args.inputNucleiFeatureFile)

    if feature_file_format == '.csv':

        ddf = dd.read_csv(args.inputNucleiFeatureFile)

    elif feature_file_format == '.h5':

        ddf = dd.read_hdf(args.inputNucleiFeatureFile, 'Features')

    else:
        raise ValueError('Extension of the input feature file must be .csv or .h5')

    return ddf 
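A minimal sketch of calling the helper above: it only needs an object exposing an inputNucleiFeatureFile attribute and dispatches on the file extension. The namespace and file name here are illustrative assumptions, not part of the original example.

from types import SimpleNamespace

# Illustrative only: any object with an `inputNucleiFeatureFile` attribute works.
args = SimpleNamespace(inputNucleiFeatureFile='nuclei_features.csv')
ddf = read_feature_file(args)   # returns a dask dataframe built from the .csv or .h5 file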
Example #4
Source File: update_landsat_metadata.py    From Landsat578 with Apache License 2.0
def split_list(_list=LATEST):

    print('Please wait while scene metadata is split')
    try:
        csv = read_csv(_list, dtype={'PRODUCT_ID': object, 'COLLECTION_NUMBER': object,
                                     'COLLECTION_CATEGORY': object},
                       blocksize=25e6, parse_dates=True)
    except EmptyDataError:
        print('Metadata has already been updated for the day.')
        return None

    csv = csv[csv.COLLECTION_NUMBER != 'PRE']

    sats = unique(csv.SPACECRAFT_ID).tolist()
    for sat in sats:
        print(sat)
        df = csv[csv.SPACECRAFT_ID == sat]
        dst = os.path.join(SCENES, sat)
        if os.path.isfile(dst):
            os.remove(dst)
        if not os.path.isdir(dst):
            os.mkdir(dst)
        df.to_parquet('{}'.format(dst))

    return None 
Example #5
Source File: data_loaders.py    From nlp-recipes with MIT License
def __init__(self, file_path, sep=",", header="infer", block_size=10e6, random_seed=None):
        """Initializes the loader.

        Args:
            file_path (str): Path to delimited file.
            sep (str, optional): Delimiter. Defaults to ",".
            header (str, optional): Row number(s) to use as the column names.
                See pandas.read_csv().
                Defaults to "infer".
            block_size (int, optional): Size of partition in bytes.
                See dask.dataframe.read_csv().
                Defaults to 10e6.
            random_seed (int, optional): Random seed. See random.seed().
                Defaults to None.
        """

        self.df = dd.read_csv(file_path, sep=sep, header=header, blocksize=block_size)

        self.random_seed = random_seed
        random.seed(random_seed) 
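As a rough illustration of the block_size argument documented above: the blocksize passed to dd.read_csv controls how the file is split into partitions, so smaller values produce more, smaller partitions. The file name below is a placeholder.

import dask.dataframe as dd

# Placeholder path; smaller blocksize -> more partitions.
small = dd.read_csv("reviews.csv", blocksize="10MB")
large = dd.read_csv("reviews.csv", blocksize="100MB")
print(small.npartitions, large.npartitions)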
Example #6
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0
def kmeans_input_fn(self, name, csv_path=None):
        """Input function for kmeans

        Arguments:
                name : string, Name of the data [Train or Eval]
                csv_path : The path of the csv on any storage system

        Returns:
                A batch of features
        """
        pattern = self._get_pattern(name, csv_path)
        tf.logging.info('The Pattern of files is : %s', pattern)
        df = dd.read_csv(pattern)
        vectors = dask.compute(df.values)
        return tf.train.limit_epochs(
            tf.convert_to_tensor(vectors[0], dtype=tf.float32), num_epochs=1) 
Example #7
Source File: make_parquet.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def main():
    client = Client()  # noqa

    categories = ["category_%d" % i for i in range(26)]
    columns = ["click"] + ["numeric_%d" % i for i in range(13)] + categories

    df = dd.read_csv("day_1", sep="\t", names=columns, header=None)

    encoding = {c: "bytes" for c in categories}
    fixed = {c: 8 for c in categories}
    df.to_parquet(
        "day-1-bytes.parquet",
        object_encoding=encoding,
        fixed_text=fixed,
        compression="SNAPPY",
    ) 
Example #8
Source File: test_fit_predict.py    From dask-lightgbm with BSD 3-Clause "New" or "Revised" License
def test_regress_newsread(client, listen_port):
    data = dd.read_csv('./system_tests/data/*.gz', compression='gzip', blocksize=None)
    dX = data.iloc[:, 1:]
    dy = data.iloc[:, 0]

    d_regress = dlgbm.LGBMRegressor(n_estimators=50, local_listen_port=listen_port)
    d_regress.fit(dX, dy)

    dy_pred = d_regress.predict(dX, client=client)

    # The dask_ml.metrics.r2_score method fails with dataframes so we compute the R2 score ourselves
    numerator = ((dy - dy_pred) ** 2).sum()
    denominator = ((dy - dy.mean()) ** 2).sum()
    r2_score = 1 - numerator / denominator
    r2_score = r2_score.compute()
    print(r2_score)

    assert r2_score > 0.8 
Example #9
Source File: print.py    From autogbt-alt with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()

    path = str(Path(args.input)/'*.csv')
    df = dd.read_csv(
        path,
        dtype={
            'n_trials': 'float64',
        },
    ).compute()
    df = df.groupby(['model', 'dataset']).agg({
        'duration[s]': ['mean', 'std'],
        'CV AUC': ['mean', 'std'],
    })
    dummy = pd.DataFrame(index=[
        ('xgb', 'avazu'),
        ('lgb', 'avazu'),
    ])
    df = df.append(dummy)
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    for c in ['duration[s]', 'CV AUC']:
        mean = '%s_mean' % (c)
        std = '%s_std' % (c)
        df[std] = df[std].apply(lambda d: '±%.3f' % (d))
        df[mean] = df[mean].apply(lambda d: '%.3f' % (d))
        df[c] = (df[mean] + df[std]).apply(_handle_nan)
    df['dataset'] = df['dataset'].map(const.competitions)
    df['model'] = df['model'].map(const.models)
    df = df[['dataset', 'model', 'duration[s]', 'CV AUC']]
    df = df.sort_values(['dataset', 'model'])
    df = df.reset_index(drop=True)
    df['model'] = df['model'].apply(lambda d: d[1])

    for dset, grp in df.groupby('dataset'):
        grp.pop('dataset')
        md = tabulate(grp.values, grp.columns, tablefmt='pipe', floatfmt='.3f')
        print('#### %s\n' % (dset))
        print(md + '\n') 
Example #10
Source File: io.py    From EarthSim with BSD 3-Clause "New" or "Revised" License
def open_gssha(filename):
    """
    Reads various filetypes produced by GSSHA
    """
    # Read metadata
    ftype = filename.split('.')[-1]
    if ftype in ['fgd', 'asc']:
        f = open(filename, 'r')
        c, r, xlc, ylc, gsize, nanval = [
            t(f.readline().split(' ')[-1].split('\n')[0])
            for t in [int, int, float, float, float, float]
        ]
        xs = np.linspace(xlc+gsize/2., xlc+c*gsize-gsize/2., c+1)
        ys = np.linspace(ylc+gsize/2., ylc+r*gsize-gsize/2., r)
    else:
        header_df = pd.read_table(filename, engine='python',
                              names=['meta_key', 'meta_val'],
                              sep=' ', nrows=6)
        bounds = header_df.loc[:3, 'meta_val'].values.astype(float)
        r, c = header_df.loc[4:6, 'meta_val'].values.astype(int)
        xs, ys = get_sampling(bounds, (r, c))
    
    # Read data using dask
    ddf = dd.read_csv(filename, skiprows=6, header=None,
                      sep=' ')
    darr = ddf.values.compute()
        
    if ftype == 'fgd':
        darr[darr==nanval] = np.NaN
    
    return xr.DataArray(darr[::-1], coords={'x': xs, 'y': ys},
                        name='z', dims=['y', 'x']) 
Example #11
Source File: vis_model_and_task.py    From autogbt-alt with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()

    data_dir = Path(args.input)
    path = str(data_dir/'*.csv')
    df = dd.read_csv(path).compute()
    df['model'] = df['model'].map(const.models)
    df = df.sort_values(['dataset', 'model']).reset_index(drop=True)
    df = df.groupby(['model', 'dataset']).agg({
        'CV AUC': ['mean', 'std'],
        'duration[s]': ['mean', 'std'],
    })
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    df['model'] = df['model'].apply(lambda d: d[1])
    print(df)

    # plot
    plt.figure(figsize=(8, 6))
    for i, (_, model) in enumerate(const.models.values()):
        for j, dset in enumerate(['airline', 'amazon', 'bank']):
            idx = (df['model'] == model) &\
                  (df['dataset'] == dset)
            x = df.loc[idx, 'duration[s]_mean']
            y = df.loc[idx, 'CV AUC_mean']
            xerr = df.loc[idx, 'duration[s]_std']
            yerr = df.loc[idx, 'CV AUC_std']
            fmt = '%sC%d' % (['o', 's', 'D', '^'][j], i)
            label = 'model=%s, dataset=%s' % (model, dset)  # noqa
            plt.errorbar(x, y, xerr=xerr, yerr=yerr, fmt=fmt, label=label)

    plt.title('Model Comparison')
    plt.xlabel('Training Time[s]')
    plt.ylabel('CV AUC')
    plt.legend(loc='lower right')
    plt.savefig(data_dir/'model_and_task.png') 
Example #12
Source File: vis_frac_and_duration.py    From autogbt-alt with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True)
    args = parser.parse_args()

    data_dir = Path(args.input)
    name = data_dir.parent.name
    path = str(data_dir/'*.csv')
    df = dd.read_csv(path).compute()
    df = df.groupby(['model', 'n_trials', 'model_train_frac']).agg({
        'CV AUC': ['mean', 'std'],
        'duration[s]': ['mean', 'std'],
    })
    df.columns = ['%s_%s' % (a, b) for a, b in df.columns]
    df = df.reset_index()
    print(df)

    # plot
    fracs = sorted(df['model_train_frac'].unique())
    plt.figure(figsize=(12, 5))
    for j, frac in enumerate(fracs):
        for i, n_trials in enumerate([1, 10, 20, 30]):
            idx = (df['model'] == 'auto') &\
                  (df['n_trials'] == n_trials) &\
                  (df['model_train_frac'] == frac)
            x = df.loc[idx, 'duration[s]_mean']
            y = df.loc[idx, 'CV AUC_mean']
            xerr = df.loc[idx, 'duration[s]_std']
            yerr = df.loc[idx, 'CV AUC_std']
            fmt = '%sC%d' % (['x', 'o', 's', 'D'][i], j)
            label = 'n_trials=%d, model_train_frac=%.2f' % (n_trials, frac)  # noqa
            plt.errorbar(x, y, xerr=xerr, yerr=yerr, fmt=fmt, label=label)

    plt.title('Parameter Comparison (dataset=%s)' % (name))
    plt.xlabel('Training Time[s]')
    plt.ylabel('CV AUC')
    plt.legend()
    plt.savefig(str(data_dir/'frac-and-n_trials.png')) 
Example #13
Source File: viirs_edr_active_fires.py    From satpy with GNU General Public License v3.0
def __init__(self, filename, filename_info, filetype_info):
        """Make sure filepath is valid and then reads data into a Dask DataFrame.

        Args:
            filename: Filename
            filename_info: Filename information
            filetype_info: Filetype information

        """
        skip_rows = filetype_info.get('skip_rows', 15)
        columns = filetype_info['columns']
        self.file_content = dd.read_csv(filename, skiprows=skip_rows, header=None, names=columns)
        super(VIIRSActiveFiresTextFileHandler, self).__init__(filename, filename_info, filetype_info)
        self.platform_name = PLATFORM_MAP.get(self.filename_info['satellite_name'].upper(), "unknown") 
Example #14
Source File: test_fit_predict.py    From dask-lightgbm with BSD 3-Clause "New" or "Revised" License
def test_classify_newsread(client, listen_port):
    data = dd.read_csv('./system_tests/data/*.gz', compression='gzip', blocksize=None)
    dX = data.iloc[:, :-1]
    dy = data.iloc[:, -1]

    d_classif = dlgbm.LGBMClassifier(n_estimators=50, local_listen_port=listen_port)
    d_classif.fit(dX, dy)

    dy_pred = d_classif.predict(dX, client=client)

    acc_score = (dy == dy_pred).sum() / len(dy)
    acc_score = acc_score.compute()
    print(acc_score)

    assert acc_score > 0.8 
Example #15
Source File: utils.py    From pystore with Apache License 2.0
def read_csv(urlpath, *args, **kwargs):
    def rename_dask_index(df, name):
        df.index.name = name
        return df

    index_col = index_name = None

    if "index" in kwargs:
        del kwargs["index"]
    if "index_col" in kwargs:
        index_col = kwargs["index_col"]
        if isinstance(index_col, list):
            index_col = index_col[0]
        del kwargs["index_col"]
    if "index_name" in kwargs:
        index_name = kwargs["index_name"]
        del kwargs["index_name"]

    df = dd.read_csv(urlpath, *args, **kwargs)

    if index_col is not None:
        df = df.set_index(index_col)

    if index_name is not None:
        df = df.map_partitions(rename_dask_index, index_name)

    return df 
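A brief usage sketch for the wrapper above, assuming CSV files that carry a timestamp column; the glob pattern and column names are illustrative, not taken from pystore itself.

# Illustrative call of the read_csv wrapper defined above.
ddf = read_csv('trades/*.csv', index_col=['timestamp'], index_name='time',
               parse_dates=['timestamp'])
print(ddf.index.name)   # should report 'time' after the per-partition rename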
Example #16
Source File: ml_100k.py    From recommender-tensorflow with MIT License
def load_data(src_dir="data/ml-100k"):
    data = {item: dd.read_csv(str(Path(src_dir, conf["filename"])), sep=conf["sep"],
                              header=None, names=conf["columns"], encoding="latin-1")
            for item, conf in DATA_CONFIG.items()}

    logger.info("data loaded.")
    return data 
Example #17
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0
def _parse_csv(self):
        """Reads csv files in dask to determine the datatypes and other features about data
        this helps in creating a dataset object in tensorflow

        Returns:
                df : dask dataframe, parsed dataframe object
                list(df.columns) : list, list of column names
        """
        if self.gcs_path:
            if isinstance(self.csv_path, list):
                for index, path in enumerate(self.csv_path):
                    parse_result = urlparse(path)
                    bucket = parse_result.hostname
                    csv_name = parse_result.path
                    self._download_csv(
                        bucket,
                        csv_name,
                        path_name='/tmp/data_' +
                                  str(index) +
                                  '.csv')
                csv_path = '/tmp/data_*.csv'
            else:
                parse_result = urlparse(self.csv_path)
                bucket = parse_result.hostname
                csv_name = parse_result.path
                self._download_csv(bucket, csv_name)
                csv_path = '/tmp/data.csv'
        else:
            csv_path = self.csv_path

        if self.column_names:
            header = None
        else:
            header = 'infer'

        try:
            df = dd.read_csv(
                csv_path,
                names=self.column_names,
                header=header,
                na_values=self.na_values,
                sample=12800000,
                dtype=self.data_type)
            if isinstance(csv_path, list):
                len(df)  # Checks whether schema is consistent throughout the data
        except Exception:
            raise AssertionError(
                'Data types given are inconsistent with data provided')

        if self.to_drop is not None:
            drop_column_names = self.to_drop
            drop_column_names = [
                name for name in drop_column_names if name in df.columns]
            df = self.drop_cols(df, drop_column_names)
            tf.logging.info('Dropping the columns : %s', drop_column_names)

        return df, list(df.columns) 
Example #18
Source File: tests-main.py    From d6tpipe with MIT License
def test_intro_stat_learning(self, cleanup, signup, testcfg):
        cfg_name = cfg_settings_islr['name']
        cfg_filenames_islr = ['Advertising.csv', 'Advertising2.csv', 'Auto.csv', 'Ch10Ex11.csv', 'College.csv', 'Credit.csv', 'Heart.csv', 'Income1.csv', 'Income2.csv', 'LICENSE.md', 'README.md']

        # start with local repo
        pipelocal = d6tpipe.PipeLocal(cfg_name, profile=cfg_profile, filecfg=cfg_cfgfname)
        pipelocal.delete_files_local(confirm=False,delete_all=False)
        pipelocal.import_dir('tests/intro-stat-learning/')
        assert pipelocal.scan_local() == cfg_filenames_islr
        assert pipelocal.files() == []
        assert pipelocal.files(fromdb=False) == cfg_filenames_islr

        df = pd.read_csv(pipelocal.dirpath/'Advertising.csv')
        assert not df.empty

        if not testcfg.get('local', False):
            # set up public repo
            api = getapi()
            d6tpipe.upsert_pipe(api, cfg_settings_islr)
            d6tpipe.upsert_permissions(api, cfg_name, {"username": 'public', "role": "read"})
            pipe = d6tpipe.Pipe(api,cfg_name,mode='all')
            pipe.delete_files_remote(confirm=False)
            assert pipe.scan_remote(cached=False) == []
            assert pipe.push()==cfg_filenames_islr
            pipelocal = d6tpipe.PipeLocal(cfg_name, profile=cfg_profile, filecfg=cfg_cfgfname)
            assert len(pipelocal.schema)>0

            api2 = getapi2()
            pipe = d6tpipe.Pipe(api2,cfg_name)
            pipe.delete_files_local(confirm=False, delete_all=False)
            assert pipe.pull()==cfg_filenames_islr

            df = pd.read_csv(pipe.dirpath / 'Advertising.csv', **pipe.schema['pandas'])
            assert not df.empty

            import dask.dataframe as dd
            files = pipe.filepaths(include='Advertising*.csv')
            ddf = dd.read_csv(files, **pipe.schema['dask'])
            assert not ddf.compute().empty
            pipe.delete_files_local(confirm=False, delete_all=False)

        pipelocal.delete_files_local(confirm=False,delete_all=True) 
Example #19
Source File: tests-main.py    From d6tpipe with MIT License
def test_pipes_pull(self, cleanup, signup, parentinit, pipeinit, testcfg):
        api = getapi(testcfg.get('local',False))
        pipe = getpipe(api)
        assert pipe.name in api.list_pipes()

        cfg_chk_crc = ['8a9782e9efa8befa9752045ca506a62e',
         '5fe579d6b71031dad399e8d4ea82820b',
         '4c7da169df85253d7ff737dde1e7400b',
         'ca62a122993494e763fd1676cce95e76']

        # assert False
        assert pipe.files() == []
        assert pipe.scan_remote() == cfg_filenames_chk
        r, d = pipe.scan_remote(attributes=True)
        assert _filenames(d) == cfg_filenames_chk
        assert [o['crc'] for o in d]==cfg_chk_crc

        assert api.list_local_pipes()==[]
        assert pipe.pull_preview() == cfg_filenames_chk
        assert pipe.pull() == cfg_filenames_chk
        assert pipe.pull_preview() == []
        assert api.list_local_pipes()==[pipe.name]

        assert pipe.files() == cfg_filenames_chk
        assert pipe.filepaths() == [Path(pipe.dirpath)/f for f in pipe.files()]
        assert pipe.filepaths(aspathlib=False) == [str(Path(pipe.dirpath)/f) for f in pipe.files()]

        pipe = getpipe(api, chk_empty=False, mode='all')
        assert pipe.pull_preview() == cfg_filenames_chk

        # PipeLocal
        pipelocal = d6tpipe.PipeLocal(pipe.name,profile=cfg_profile, filecfg=cfg_cfgfname)
        assert pipelocal.files() == cfg_filenames_chk
        assert pipelocal.scan_local() == cfg_filenames_chk
        assert pipelocal.schema == cfg_settings_pipe['schema']
        df = pd.read_csv(pipe.dirpath/cfg_filenames_chk[0], **pipe.schema['pandas'])

        # permissions
        if not testcfg.get('local',False):
            api2 = getapi2(testcfg.get('local', False))
            with pytest.raises(APIError, match='403'):
                pipe2 = getpipe(api2, name=cfg_pipe_name, mode='all')
                pipe2.pull()

            settings = {"username": cfg_usr2, "role": "read"}
            r,d = d6tpipe.upsert_permissions(api, cfg_parent_name, settings)

            pipe2 = getpipe(api2, name=cfg_pipe_name, mode='all')
            assert pipe2.pull()==cfg_filenames_chk

        # cleanup
        pipe.delete_files_local(confirm=False,delete_all=True)