Python s3fs.S3FileSystem() Examples

The following are 29 code examples of s3fs.S3FileSystem(), collected from open-source projects. Each example notes the source file and project it comes from, together with that project's license. You may also want to check out the other available functions and classes of the s3fs module.
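
As a minimal orientation before the examples, here is a hedged sketch of typical s3fs usage; the bucket and key names are placeholders, and anonymous access only works for publicly readable buckets.

import s3fs

# Minimal sketch (placeholder bucket/key names): S3FileSystem exposes S3 through a
# file-system-like interface, so listing and opening objects resembles local I/O.
fs = s3fs.S3FileSystem(anon=True)          # anonymous, read-only access to public data
print(fs.ls("my-public-bucket"))           # list keys under the bucket
with fs.open("my-public-bucket/data.csv", "rb") as f:
    head = f.read(1024)                    # read the first kilobyte of the object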
Example #1
Source File: test_mldataset.py    From xcube with MIT License
def test_s3_levels(self):
        with moto.mock_s3():
            self._write_test_cube_pyramid()

            s3 = s3fs.S3FileSystem(key='test_fake_id',
                                   secret='test_fake_secret',
                                   client_kwargs=dict(endpoint_url="https://s3.amazonaws.com"))
            ml_dataset = ObjectStorageMultiLevelDataset(s3,
                                                        "xcube-test/cube-1-250-250.levels",
                                                        chunk_cache_capacity=1000 * 1000 * 1000)
            self.assertIsNotNone(ml_dataset)
            self.assertEqual(3, ml_dataset.num_levels)
            self.assertEqual((250, 250), ml_dataset.tile_grid.tile_size)
            self.assertEqual(2, ml_dataset.tile_grid.num_level_zero_tiles_x)
            self.assertEqual(1, ml_dataset.tile_grid.num_level_zero_tiles_y)
            self.assertEqual(761904762, ml_dataset.get_chunk_cache_capacity(0))
            self.assertEqual(190476190, ml_dataset.get_chunk_cache_capacity(1))
            self.assertEqual(47619048, ml_dataset.get_chunk_cache_capacity(2)) 
Example #2
Source File: s3.py    From recruit with Apache License 2.0
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):

    if mode is None:
        mode = 'rb'

    fs = s3fs.S3FileSystem(anon=False)
    try:
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
    except (compat.FileNotFoundError, NoCredentialsError):
        # boto3 has trouble when trying to access a public file
        # when credentialed...
        # An OSError is raised if you have credentials, but they
        # aren't valid for that bucket.
        # A NoCredentialsError is raised if you don't have creds
        # for that bucket.
        fs = s3fs.S3FileSystem(anon=True)
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
    return filepath_or_buffer, None, compression, True 
Example #3
Source File: s3.py    From vnpy_crypto with MIT License
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):

    if mode is None:
        mode = 'rb'

    fs = s3fs.S3FileSystem(anon=False)
    try:
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
    except (compat.FileNotFoundError, NoCredentialsError):
        # boto3 has trouble when trying to access a public file
        # when credentialed...
        # An OSError is raised if you have credentials, but they
        # aren't valid for that bucket.
        # A NoCredentialsError is raised if you don't have creds
        # for that bucket.
        fs = s3fs.S3FileSystem(anon=True)
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
    return filepath_or_buffer, None, compression, True 
Example #4
Source File: s3.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):

    if mode is None:
        mode = 'rb'

    fs = s3fs.S3FileSystem(anon=False)
    try:
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
    except (compat.FileNotFoundError, NoCredentialsError):
        # boto3 has trouble when trying to access a public file
        # when credentialed...
        # An OSError is raised if you have credentials, but they
        # aren't valid for that bucket.
        # A NoCredentialsError is raised if you don't have creds
        # for that bucket.
        fs = s3fs.S3FileSystem(anon=True)
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
    return filepath_or_buffer, None, compression, True 
Example #5
Source File: parquet.py    From timeserio with MIT License
def __init__(
        self,
        *,
        path,
        batch_size=None,
        columns=None,
        batch_aggregator=1
    ):
        super().__init__()
        self.path = path
        self.batch_size = batch_size
        self.columns = columns
        self.batch_aggregator = batch_aggregator

        if self.path.startswith('s3://'):
            s3 = s3fs.S3FileSystem()
            self.files = [f's3://{file}' for file in s3.ls(self.path)]
        else:
            self.files = [
                os.path.join(path, file)
                for file in os.listdir(self.path)
            ]
        self.files = [f for f in self.files if f.endswith('.parquet')] 
Example #6
Source File: dsio.py    From xcube with MIT License
def get_path_or_obs_store(path_or_url: str,
                          client_kwargs: Mapping[str, Any] = None,
                          mode: str = 'r') -> Tuple[Union[str, Dict], bool]:
    """
    If *path_or_url* is an object storage URL, return an object storage Zarr store (mapping object)
    created using *client_kwargs* and *mode*, plus a flag indicating whether the Zarr dataset is consolidated.

    Otherwise *path_or_url* is interpreted as a local file system path and returned as-is, plus
    a flag indicating whether the Zarr dataset is consolidated.

    :param path_or_url: A path or a URL.
    :param client_kwargs: Object storage client keyword arguments.
    :param mode: "r" or "w"
    :return: A tuple (path_or_obs_store, consolidated).
    """
    if is_obs_url(path_or_url):
        root, obs_fs_kwargs, obs_fs_client_kwargs = parse_obs_url_and_kwargs(path_or_url, client_kwargs)
        s3 = s3fs.S3FileSystem(**obs_fs_kwargs, client_kwargs=obs_fs_client_kwargs)
        consolidated = mode == "r" and s3.exists(f'{root}/.zmetadata')
        return s3fs.S3Map(root=root, s3=s3, check=False, create=mode == "w"), consolidated
    else:
        consolidated = os.path.exists(os.path.join(path_or_url, '.zmetadata'))
        return path_or_url, consolidated 
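
A hedged usage sketch of get_path_or_obs_store (the bucket URL below is a placeholder): the returned store and consolidated flag can be handed straight to xarray.

import xarray as xr

# Placeholder URL; for a local path the helper returns the path unchanged.
store, consolidated = get_path_or_obs_store(
    "https://s3.amazonaws.com/some-bucket/cube.zarr", mode="r")
ds = xr.open_zarr(store, consolidated=consolidated)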
Example #7
Source File: file_reader.py    From modin with Apache License 2.0
def file_exists(cls, file_path):
        if isinstance(file_path, str):
            match = S3_ADDRESS_REGEX.search(file_path)
            if match is not None:
                if file_path[0] == "S":
                    file_path = "{}{}".format("s", file_path[1:])
                import s3fs as S3FS
                from botocore.exceptions import NoCredentialsError

                s3fs = S3FS.S3FileSystem(anon=False)
                exists = False
                try:
                    exists = s3fs.exists(file_path) or exists
                except NoCredentialsError:
                    pass
                s3fs = S3FS.S3FileSystem(anon=True)
                return exists or s3fs.exists(file_path)
        return os.path.exists(file_path) 
Example #8
Source File: data.py    From pyAFQ with BSD 2-Clause "Simplified" License
def s3fs_json_write(data, fname, fs=None):
    """
    Writes json from a dict directly into S3

    Parameters
    ----------
    data : dict
        The json to be written out
    fname : str
        Full path (including bucket name and extension) to the file to
        be written out on S3
    fs : an s3fs.S3FileSystem class instance, optional
        A file-system to refer to. Defaults to creating a new file-system.
    """
    if fs is None:
        fs = s3fs.S3FileSystem()
    with fs.open(fname, 'w') as ff:
        json.dump(data, ff) 
Example #9
Source File: data.py    From pyAFQ with BSD 2-Clause "Simplified" License
def s3fs_json_read(fname, fs=None):
    """
    Reads json directly from S3

    Parameters
    ----------
    fname : str
        Full path (including bucket name and extension) to the file on S3.
    fs : an s3fs.S3FileSystem class instance, optional
        A file-system to refer to. Defaults to creating a new file-system.

    """
    if fs is None:
        fs = s3fs.S3FileSystem()
    with fs.open(fname) as ff:
        data = json.load(ff)
    return data 
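
A possible round trip using the two helpers above; the bucket and key are placeholders, and writing assumes credentials with write access to that bucket.

fs = s3fs.S3FileSystem()
s3fs_json_write({"subject": "01", "status": "done"}, "my-bucket/metadata.json", fs=fs)
metadata = s3fs_json_read("my-bucket/metadata.json", fs=fs)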
Example #10
Source File: data.py    From pyAFQ with BSD 2-Clause "Simplified" License
def s3fs_nifti_write(img, fname, fs=None):
    """
    Write a nifti file straight to S3

    Parameters
    ----------
    img : nib.Nifti1Image class instance
        The image containing data to be written into S3
    fname : string
        Full path (including bucket name and extension) to the S3 location
        where the file is to be saved.
    fs : an s3fs.S3FileSystem class instance, optional
        A file-system to refer to. Defaults to creating a new file-system.
    """
    if fs is None:
        fs = s3fs.S3FileSystem()

    bio = BytesIO()
    file_map = img.make_file_map({'image': bio, 'header': bio})
    img.to_file_map(file_map)
    data = gzip.compress(bio.getvalue())
    with fs.open(fname, 'wb') as ff:
        ff.write(data) 
Example #11
Source File: s3.py    From elasticintel with GNU General Public License v3.0
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    fs = s3fs.S3FileSystem(anon=False)
    try:
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer))
    except (OSError, NoCredentialsError):
        # boto3 has trouble when trying to access a public file
        # when credentialed...
        # An OSError is raised if you have credentials, but they
        # aren't valid for that bucket.
        # A NoCredentialsError is raised if you don't have creds
        # for that bucket.
        fs = s3fs.S3FileSystem(anon=True)
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer))
    return filepath_or_buffer, None, compression 
Example #12
Source File: pickle.py    From timeserio with MIT License
def open_url(filename, mode):
    """Open file from local drive or s3 bucket.

    S3 filename must start with `s3://`.
    """
    if filename.startswith('s3://'):
        s3 = s3fs.S3FileSystem()
        file = s3.open(filename, mode)
    else:
        file = open(filename, mode)
    return file 
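
For illustration, the same call then covers both local paths and S3 URLs (the S3 path below is a placeholder):

with open_url("s3://my-bucket/model.pkl", "rb") as f:
    payload = f.read()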
Example #13
Source File: io.py    From cate with MIT License
def read_zarr(path: str,
              file_system: str = 'Local',
              drop_variables: VarNamesLike.TYPE = None,
              decode_cf: bool = True,
              decode_times: bool = True,
              normalize: bool = True) -> xr.Dataset:
    """
    Read a dataset from a Zarr directory, Zarr ZIP archive, or remote Zarr object storage.

    For the Zarr format, refer to http://zarr.readthedocs.io/en/stable/.

    :param path: Zarr directory path, Zarr ZIP archive path, or object storage path or bucket name.
    :param file_system: File system identifier, "Local" is your locally mounted file system,
           for Amazon S3 use "S3", for general Object Storage use "OBS".
    :param drop_variables: List of variables to be dropped.
    :param decode_cf: Whether to decode CF attributes and coordinate variables.
    :param decode_times: Whether to decode time information (convert time coordinates to ``datetime`` objects).
    :param normalize: Whether to normalize the dataset's geo- and time-coding upon opening. See operation ``normalize``.
    """
    drop_variables = VarNamesLike.convert(drop_variables)

    if file_system == 'Local':
        ds = xr.open_zarr(path,
                          drop_variables=drop_variables,
                          decode_cf=decode_cf,
                          decode_times=decode_times)
    elif file_system == 'S3' or file_system == 'OBS':
        import s3fs
        store = s3fs.S3Map(path, s3=(s3fs.S3FileSystem(anon=True)))
        ds = xr.open_zarr(store,
                          drop_variables=drop_variables,
                          decode_cf=decode_cf,
                          decode_times=decode_times)
    else:
        raise ValidationError(f'Unknown file_system {file_system!r}')

    if normalize:
        return adjust_temporal_attrs(normalize_op(ds))
    return ds 
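
A hedged usage sketch, assuming a publicly readable bucket that holds a Zarr dataset (the path is a placeholder):

# Anonymous S3 access is used internally when file_system="S3".
ds = read_zarr("my-bucket/my-cube.zarr", file_system="S3")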
Example #14
Source File: conftest.py    From timeserio with MIT License
def s3(test_bucket_name):
    # writable local S3 system
    with moto.mock_s3():
        client = boto3.client('s3')
        client.create_bucket(Bucket=test_bucket_name)
        yield s3fs.S3FileSystem() 
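
A sketch of how such a fixture might be used in a test; the test function is hypothetical, and test_bucket_name is assumed to be a companion fixture supplying the bucket name.

def test_roundtrip(s3, test_bucket_name):
    path = f"{test_bucket_name}/hello.txt"
    with s3.open(path, "wb") as f:
        f.write(b"hello")            # write to the mocked, in-memory S3
    assert s3.cat(path) == b"hello"  # read it back through the same file system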
Example #15
Source File: s3.py    From Splunking-Crime with GNU Affero General Public License v3.0
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    fs = s3fs.S3FileSystem(anon=False)
    try:
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer))
    except (OSError, NoCredentialsError):
        # boto3 has trouble when trying to access a public file
        # when credentialed...
        # An OSError is raised if you have credentials, but they
        # aren't valid for that bucket.
        # A NoCredentialsError is raised if you don't have creds
        # for that bucket.
        fs = s3fs.S3FileSystem(anon=True)
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer))
    return filepath_or_buffer, None, compression 
Example #16
Source File: persistence.py    From palladium with Apache License 2.0
def __init__(self, **kwargs):
        try:
            import s3fs
        except ImportError:
            raise ImportError('S3IO needs the s3fs module to work correctly.')

        self.fs = s3fs.S3FileSystem(anon=False) 
Example #17
Source File: s3_fs.py    From s3contents with Apache License 2.0
def __init__(self, log, **kwargs):
        super(S3FS, self).__init__(**kwargs)
        self.log = log

        client_kwargs = {
            "endpoint_url": self.endpoint_url,
            "region_name": self.region_name,
        }
        config_kwargs = {}
        if self.signature_version:
            config_kwargs["signature_version"] = self.signature_version
        s3_additional_kwargs = {}
        if self.sse:
            s3_additional_kwargs["ServerSideEncryption"] = self.sse
        if self.kms_key_id:
            s3_additional_kwargs["SSEKMSKeyId"] = self.kms_key_id

        self.fs = s3fs.S3FileSystem(
            key=self.access_key_id,
            secret=self.secret_access_key,
            token=self.session_token,
            client_kwargs=client_kwargs,
            config_kwargs=config_kwargs,
            s3_additional_kwargs=s3_additional_kwargs,
            session=self.boto3_session,
        )

        self.init() 
Example #18
Source File: mldataset.py    From xcube with MIT License
def open_ml_dataset_from_object_storage(path: str,
                                        data_format: str = None,
                                        ds_id: str = None,
                                        exception_type: type = ValueError,
                                        client_kwargs: Mapping[str, Any] = None,
                                        chunk_cache_capacity: int = None,
                                        **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    root, obs_fs_kwargs, obs_fs_client_kwargs = parse_obs_url_and_kwargs(path, client_kwargs)
    obs_fs = s3fs.S3FileSystem(**obs_fs_kwargs, client_kwargs=obs_fs_client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=root, s3=obs_fs, check=False)
        if chunk_cache_capacity:
            store = zarr.LRUStoreCache(store, max_size=chunk_cache_capacity)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = obs_fs.exists(f'{root}/.zmetadata')
            ds = assert_cube(xr.open_zarr(store, consolidated=consolidated, **kwargs))
        return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(obs_fs,
                                                  root,
                                                  zarr_kwargs=kwargs,
                                                  ds_id=ds_id,
                                                  chunk_cache_capacity=chunk_cache_capacity,
                                                  exception_type=exception_type)

    raise exception_type(f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}') 
Example #19
Source File: dataset.py    From xcube with MIT License
def __init__(self, s3_fs: s3fs.S3FileSystem = None):
        self._s3_fs = s3_fs

    # noinspection PyUnusedLocal,PyMethodMayBeStatic 
Example #20
Source File: dsio.py    From xcube with MIT License
def parse_obs_url_and_kwargs(obs_url: str, obs_kwargs: Mapping[str, Any]) -> Tuple[str, Dict[str, Any], Dict[str, Any]]:
    """
    Parses *obs_url* and *obs_kwargs* and returns a
    tuple (*root*, *kwargs*, *client_kwargs*) whose elements
    can be passed to the s3fs.S3FileSystem and s3fs.S3Map constructors as follows::

        obs_fs = s3fs.S3FileSystem(**kwargs, client_kwargs=client_kwargs)
        obs_map = s3fs.S3Map(root=root, s3=obs_fs)

    :param obs_url: Object storage URL, e.g. "s3://bucket/root", or "https://bucket.s3.amazonaws.com/root".
    :param obs_kwargs: Keyword arguments.
    :return: A tuple (root, kwargs, client_kwargs).
    """

    anon = True
    key = None
    secret = None
    client_kwargs = dict(obs_kwargs) if obs_kwargs else dict()

    endpoint_url, root = split_obs_url(obs_url)
    if endpoint_url:
        client_kwargs['endpoint_url'] = endpoint_url

    if 'provider_access_key_id' in client_kwargs:
        key = client_kwargs.pop('provider_access_key_id')
    if 'aws_access_key_id' in client_kwargs:
        key = client_kwargs.pop('aws_access_key_id')
    if 'provider_secret_access_key' in client_kwargs:
        secret = client_kwargs.pop('provider_secret_access_key')
    if 'aws_secret_access_key' in client_kwargs:
        secret = client_kwargs.pop('aws_secret_access_key')
    if key and secret:
        anon = False
    else:
        key = secret = None

    return root, dict(anon=anon, key=key, secret=secret), client_kwargs 
Example #21
Source File: test_timeslice.py    From xcube with MIT License
def test_remote(self):
        import s3fs
        endpoint_url = "http://obs.eu-de.otc.t-systems.com"
        s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(endpoint_url=endpoint_url))
        s3_store = s3fs.S3Map(root="cyanoalert/cyanoalert-olci-lswe-l2c-v1.zarr", s3=s3, check=False)
        diagnostic_store = DiagnosticStore(s3_store, logging_observer(log_path='remote-cube.log'))
        xr.open_zarr(diagnostic_store) 
Example #22
Source File: test_commoncrawl.py    From CommonCrawlJob with Apache License 2.0
def setUp(self):
        self.s3 = S3FileSystem(anon=True, use_ssl=False)
        self.key = '/'.join([
            'common-crawl',
            'crawl-data',
            'CC-MAIN-2016-07',
            'segments',
            '1454702039825.90',
            'warc',
            'CC-MAIN-20160205195359-00348-ip-10-236-182-209.ec2.internal.warc.gz',
        ])
        self.s3_url = 's3://aws-publicdatasets/{key}'.format(key=self.key) 
Example #23
Source File: data.py    From pyAFQ with BSD 2-Clause "Simplified" License
def s3fs_nifti_read(fname, fs=None):
    """
    Lazily reads a nifti image from S3.

    Parameters
    ----------
    fname : string
        Full path (including bucket name and extension) to the S3 location
        of the file to be read.
    fs : an s3fs.S3FileSystem class instance, optional
        A file-system to refer to. Defaults to creating a new file-system.

    Returns
    -------
    nib.Nifti1Image class instance

    Note
    ----
    Because the image is lazily loaded, data stored in the file
    is not transferred until `get_fdata` is called.

    """
    if fs is None:
        fs = s3fs.S3FileSystem()
    with fs.open(fname) as ff:
        zz = gzip.open(ff)
        rr = zz.read()
        bb = BytesIO(rr)
        fh = nib.FileHolder(fileobj=bb)
        img = nib.Nifti1Image.from_file_map({'header': fh, 'image': fh})
    return img 
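
Combined with s3fs_nifti_write from Example #10, a hedged round trip could look like this; the bucket path is a placeholder and write access is assumed.

import numpy as np
import nibabel as nib

img = nib.Nifti1Image(np.zeros((2, 2, 2)), affine=np.eye(4))
s3fs_nifti_write(img, "my-bucket/derivatives/zeros.nii.gz")
img_back = s3fs_nifti_read("my-bucket/derivatives/zeros.nii.gz")
data = img_back.get_fdata()   # pixel data is only transferred at this point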
Example #24
Source File: s3.py    From spectrify with MIT License
def get_fs(self):
        return s3fs.S3FileSystem(anon=False, default_block_size=SPECTRIFY_BLOCKSIZE) 
Example #25
Source File: mldataset.py    From xcube with MIT License
def __init__(self,
                 s3_file_system: s3fs.S3FileSystem,
                 dir_path: str,
                 zarr_kwargs: Dict[str, Any] = None,
                 ds_id: str = None,
                 chunk_cache_capacity: int = None,
                 exception_type: type = ValueError):

        level_paths = {}
        entries = s3_file_system.ls(dir_path, detail=False)
        for entry in entries:
            level_dir = entry.split("/")[-1]
            basename, ext = os.path.splitext(level_dir)
            if basename.isdigit():
                level = int(basename)
                if entry.endswith(".zarr") and s3_file_system.isdir(entry):
                    level_paths[level] = (ext, dir_path + "/" + level_dir)
                elif entry.endswith(".link") and s3_file_system.isfile(entry):
                    level_paths[level] = (ext, dir_path + "/" + level_dir)

        num_levels = len(level_paths)
        # Consistency check
        for level in range(num_levels):
            if level not in level_paths:
                raise exception_type(f"Invalid multi-level dataset {ds_id!r}: missing level {level} in {dir_path}")

        super().__init__(ds_id=ds_id, parameters=zarr_kwargs)
        self._s3_file_system = s3_file_system
        self._dir_path = dir_path
        self._level_paths = level_paths
        self._num_levels = num_levels

        self._chunk_cache_capacities = None
        if chunk_cache_capacity:
            weights = []
            weigth_sum = 0
            for level in range(num_levels):
                weight = 2 ** (num_levels - 1 - level)
                weight *= weight
                weigth_sum += weight
                weights.append(weight)
            self._chunk_cache_capacities = [round(chunk_cache_capacity * weight / weigth_sum)
                                            for weight in weights] 
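
The per-level capacities asserted in Example #1 follow from this weighting; a small worked check for 3 levels and a 1 GB total cache reproduces those numbers.

capacity = 1000 * 1000 * 1000
num_levels = 3
weights = [(2 ** (num_levels - 1 - level)) ** 2 for level in range(num_levels)]  # [16, 4, 1]
total = sum(weights)                                                             # 21
print([round(capacity * w / total) for w in weights])
# -> [761904762, 190476190, 47619048]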
Example #26
Source File: parquet.py    From timeserio with MIT License
def __init__(
        self,
        *,
        path,
        batch_size=None,
        sequence_length=2,
        id_column=ini.Columns.id,
        sequence_columns=[ini.Columns.datetime, ini.Columns.target],
        sequence_prefix='seq_',
        last_step_columns=[ini.Columns.datetime],
        last_step_prefix='end_of_',
        forecast_steps_min=1,
        forecast_steps_max=1,
        batch_offset=False,
        batch_offset_period=1,
        dt_column=ini.Columns.datetime,
        start_time=None,
        batch_aggregator=1
    ):
        super().__init__()
        self.path = path
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.id_column = id_column
        self.sequence_columns = sequence_columns
        self.sequence_prefix = sequence_prefix
        self.last_step_columns = last_step_columns
        self.last_step_prefix = last_step_prefix
        self.forecast_steps_min = forecast_steps_min
        self.forecast_steps_max = forecast_steps_max
        self.batch_offset = batch_offset
        self.batch_offset_period = batch_offset_period
        self.dt_column = dt_column
        self.start_time = start_time
        self.batch_aggregator = batch_aggregator

        if self.path.startswith('s3://'):
            s3 = s3fs.S3FileSystem()
            self.files = [f's3://{file}' for file in s3.ls(self.path)]
        else:
            self.files = [
                os.path.join(path, file)
                for file in os.listdir(self.path)
            ]
        self.files = [f for f in self.files if f.endswith('.parquet')] 
Example #27
Source File: show_remote_cubes.py    From xcube with MIT License
def show_remote_cubes(bucket, endpoint_url, region_name='eu-central-1'):
    s3_client_kwargs = {}
    s3_client_kwargs['endpoint_url'] = endpoint_url
    s3_client_kwargs['region_name'] = region_name
    obs_file_system = s3fs.S3FileSystem(anon=True, client_kwargs=s3_client_kwargs)

    cube_names = []
    df = pd.DataFrame(
        columns=['cube_name', 'chunks', 'number_of_variables', 'variables',
                 'start_date', 'end_date', 'spatial_coverage'])

    for filepath in sorted(obs_file_system.ls(bucket)):
        if filepath.endswith('.zarr'):
            with open_cube(f'{endpoint_url}/{filepath}') as ds:
                var_list = list(ds.data_vars)
                cube_names.append(filepath)
                filename = filepath.split('/')[1]
                sd = pd.to_datetime(str(ds.time.values[0]))
                start_date = sd.strftime('%Y-%m-%d')
                ed = pd.to_datetime(str(ds.time.values[-1]))
                end_date = ed.strftime('%Y-%m-%d')
                chunksize = []
                for idx, dim in enumerate(ds[var_list[0]].dims):
                    chunksize.append(f"{dim}: {ds[var_list[0]].data.chunksize[idx]}")
                try:
                    spat_cov = ([
                        f"lon_min: {ds.attrs['geospatial_lon_min']}",
                        f"lat_min: {ds.attrs['geospatial_lat_min']}",
                        f"lon_max: {ds.attrs['geospatial_lon_max']}",
                        f"lat_max: {ds.attrs['geospatial_lat_max']}"])
                except KeyError:
                    spat_cov = None
                df = df.append({'cube_name': filename,
                                'chunks': ', '.join(chunksize),
                                'number_of_variables': len(var_list),
                                'variables': ', '.join(var_list),
                                'start_date': start_date,
                                'end_date': end_date,
                                'spatial_coverage': ', '.join(spat_cov)},
                               ignore_index=True)
    # Make the variables column wide enough:
    df.style.set_properties(subset=['variables'], width='300px')                        
    return df 
Example #28
Source File: file_reader.py    From modin with Apache License 2.0
def file_open(cls, file_path, mode="rb", compression="infer"):
        if isinstance(file_path, str):
            match = S3_ADDRESS_REGEX.search(file_path)
            if match is not None:
                if file_path[0] == "S":
                    file_path = "{}{}".format("s", file_path[1:])
                import s3fs as S3FS
                from botocore.exceptions import NoCredentialsError

                s3fs = S3FS.S3FileSystem(anon=False)
                try:
                    return s3fs.open(file_path)
                except NoCredentialsError:
                    s3fs = S3FS.S3FileSystem(anon=True)
                    return s3fs.open(file_path)
            elif compression == "gzip":
                import gzip

                return gzip.open(file_path, mode=mode)
            elif compression == "bz2":
                import bz2

                return bz2.BZ2File(file_path, mode=mode)
            elif compression == "xz":
                import lzma

                return lzma.LZMAFile(file_path, mode=mode)
            elif compression == "zip":
                import zipfile

                zf = zipfile.ZipFile(file_path, mode=mode.replace("b", ""))
                if zf.mode == "w":
                    return zf
                elif zf.mode == "r":
                    zip_names = zf.namelist()
                    if len(zip_names) == 1:
                        f = zf.open(zip_names.pop())
                        return f
                    elif len(zip_names) == 0:
                        raise ValueError(
                            "Zero files found in ZIP file {}".format(file_path)
                        )
                    else:
                        raise ValueError(
                            "Multiple files found in ZIP file."
                            " Only one file per ZIP: {}".format(zip_names)
                        )

        return open(file_path, mode=mode) 
Example #29
Source File: estimator.py    From gluon-ts with Apache License 2.0
def __init__(
        self,
        sagemaker_session: sagemaker.Session,
        role: str,
        image_name: str,
        base_job_name: str,
        train_instance_type: str = "ml.c5.xlarge",
        train_instance_count: int = 1,
        dependencies: Optional[List[str]] = None,
        output_path: str = None,
        code_location: str = None,
        framework_version: str = GLUONTS_VERSION,
        hyperparameters: Dict = None,
        entry_point: str = str(ENTRY_POINTS_FOLDER / TRAIN_SCRIPT),
        **kwargs,
    ):
        # Framework_version currently serves no purpose,
        # except for compatibility with the sagemaker framework.
        if framework_version is None:
            logger.warning(
                empty_framework_version_warning(
                    GLUONTS_VERSION, self.LATEST_VERSION
                )
            )
        self.framework_version = framework_version or GLUONTS_VERSION

        super().__init__(
            dependencies=dependencies,
            output_path=output_path,
            code_location=code_location,
            sagemaker_session=sagemaker_session,
            role=role,
            train_instance_type=train_instance_type,
            train_instance_count=train_instance_count,
            base_job_name=base_job_name,
            entry_point=entry_point,
            hyperparameters=hyperparameters,
            image_name=image_name,
            **kwargs,
        )

        # must be set
        self.py_version = PYTHON_VERSION

        # automatically retrieves credentials using context manager, see: https://s3fs.readthedocs.io/en/latest/
        self._s3fs = s3fs.S3FileSystem()