Python cloudstorage.listbucket() Examples
The following are 28 code examples of cloudstorage.listbucket(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module cloudstorage, or try the search function.
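Before diving into the examples, here is a minimal sketch of calling cloudstorage.listbucket() directly with the App Engine cloudstorage client. The bucket path '/my-bucket' and the prefix 'reports/' are illustrative placeholders, not values taken from any example below.

import cloudstorage

def list_files_under_prefix(bucket_path='/my-bucket', prefix='reports/'):
    """Yield the names of non-directory objects under a prefix (sketch only)."""
    # listbucket() returns an iterator of GCSFileStat objects; passing a
    # delimiter makes it behave like a one-level directory listing.
    for stat in cloudstorage.listbucket(bucket_path + '/' + prefix,
                                        delimiter='/'):
        if not stat.is_dir:
            yield stat.filename

Several of the examples below page through large listings by passing max_keys and marker instead of consuming the whole iterator in one call (see Example #8 and Example #15).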
Example #1
Source File: input_readers.py From locality-sensitive-hashing with MIT License | 6 votes |
def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
        None if no more file is left. Filename otherwise.
    """
    while True:
        if self._bucket_iter:
            try:
                return self._bucket_iter.next().filename
            except StopIteration:
                self._bucket_iter = None
                self._bucket = None
        if self._index >= len(self._filenames):
            return
        filename = self._filenames[self._index]
        self._index += 1
        if self._delimiter is None or not filename.endswith(self._delimiter):
            return filename
        self._bucket = cloudstorage.listbucket(
            filename, delimiter=self._delimiter)
        self._bucket_iter = iter(self._bucket)
Example #2
Source File: workers.py From crmint with Apache License 2.0 | 6 votes |
def _get_matching_stats(self, patterned_uris):
    stats = []
    patterns = {}
    for patterned_uri in patterned_uris:
        patterned_uri_split = patterned_uri.split('/')
        bucket = '/'.join(patterned_uri_split[1:3])
        pattern = '/'.join(patterned_uri_split[1:])
        try:
            if pattern not in patterns[bucket]:
                patterns[bucket].append(pattern)
        except KeyError:
            patterns[bucket] = [pattern]
    for bucket in patterns:
        for stat in gcs.listbucket(bucket):
            if not stat.is_dir:
                for pattern in patterns[bucket]:
                    if fnmatch(stat.filename, pattern):
                        stats.append(stat)
                        break
    return stats
Example #3
Source File: _gcs.py From locality-sensitive-hashing with MIT License | 6 votes |
def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
        None if no more file is left. Filename otherwise.
    """
    while True:
        if self._bucket_iter:
            try:
                return self._bucket_iter.next().filename
            except StopIteration:
                self._bucket_iter = None
                self._bucket = None
        if self._index >= len(self._filenames):
            return
        filename = self._filenames[self._index]
        self._index += 1
        if self._delimiter is None or not filename.endswith(self._delimiter):
            return filename
        self._bucket = cloudstorage.listbucket(
            filename, delimiter=self._delimiter)
        self._bucket_iter = iter(self._bucket)
Example #4
Source File: _gcs.py From appengine-mapreduce with Apache License 2.0 | 6 votes |
def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
        None if no more file is left. Filename otherwise.
    """
    while True:
        if self._bucket_iter:
            try:
                return self._bucket_iter.next().filename
            except StopIteration:
                self._bucket_iter = None
                self._bucket = None
        if self._index >= len(self._filenames):
            return
        filename = self._filenames[self._index]
        self._index += 1
        if self._delimiter is None or not filename.endswith(self._delimiter):
            return filename
        self._bucket = cloudstorage.listbucket(
            filename, delimiter=self._delimiter)
        self._bucket_iter = iter(self._bucket)
Example #5
Source File: main.py From billing-export-python with Apache License 2.0 | 6 votes |
def GetBillingProjects():
    """Return a list of all projects we have billing export information for."""
    projects = Projects.get_by_id('Projects')
    if projects is not None:
        logging.debug('using cached projects')
        return projects.projects
    project_list = []
    current_project = None
    for billing_object in gcs.listbucket(BUCKET, delimiter='/'):
        project_match = MatchProjectDate(billing_object.filename)
        if not project_match:
            continue
        project_name = project_match[0]
        if current_project != project_name:
            project_list.append(project_name)
            current_project = project_name
    projects = Projects(id='Projects')
    projects.projects = project_list
    projects.put()
    return project_list
Example #6
Source File: input_readers.py From appengine-mapreduce with Apache License 2.0 | 6 votes |
def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
        None if no more file is left. Filename otherwise.
    """
    while True:
        if self._bucket_iter:
            try:
                return self._bucket_iter.next().filename
            except StopIteration:
                self._bucket_iter = None
                self._bucket = None
        if self._index >= len(self._filenames):
            return
        filename = self._filenames[self._index]
        self._index += 1
        if self._delimiter is None or not filename.endswith(self._delimiter):
            return filename
        self._bucket = cloudstorage.listbucket(
            filename, delimiter=self._delimiter)
        self._bucket_iter = iter(self._bucket)
Example #7
Source File: output_writers.py From appengine-mapreduce with Apache License 2.0 | 6 votes |
def _try_to_clean_garbage(self, writer_spec, exclude_list=()):
    """Tries to remove any files created by this shard that aren't needed.

    Args:
        writer_spec: writer_spec for the MR.
        exclude_list: A list of filenames (strings) that should not be removed.
    """
    # Try to remove garbage (if any). Note that listbucket is not strongly
    # consistent so something might survive.
    tmpl = string.Template(self._TMPFILE_PREFIX)
    prefix = tmpl.substitute(
        id=self.status.mapreduce_id, shard=self.status.shard)
    bucket = self._get_tmp_gcs_bucket(writer_spec)
    account_id = self._get_tmp_account_id(writer_spec)
    for f in cloudstorage.listbucket("/%s/%s" % (bucket, prefix),
                                     _account_id=account_id):
        if f.filename not in exclude_list:
            self._remove_tmpfile(f.filename, self.status.writer_spec)
Example #8
Source File: main.py From python-docs-samples with Apache License 2.0 | 6 votes |
def list_bucket(self, bucket):
    """Create several files and paginate through them."""
    self.response.write('Listbucket result:\n')

    # Production apps should set page_size to a practical value.
    page_size = 1
    stats = cloudstorage.listbucket(bucket + '/foo', max_keys=page_size)
    while True:
        count = 0
        for stat in stats:
            count += 1
            self.response.write(repr(stat))
            self.response.write('\n')
        if count != page_size or count == 0:
            break
        stats = cloudstorage.listbucket(
            bucket + '/foo', max_keys=page_size, marker=stat.filename)
# [END list_bucket]
Example #9
Source File: output_writers.py From python-compat-runtime with Apache License 2.0 | 6 votes |
def _try_to_clean_garbage(self, writer_spec, exclude_list=()):
    """Tries to remove any files created by this shard that aren't needed.

    Args:
        writer_spec: writer_spec for the MR.
        exclude_list: A list of filenames (strings) that should not be removed.
    """
    tmpl = string.Template(self._TMPFILE_PREFIX)
    prefix = tmpl.substitute(
        id=self.status.mapreduce_id, shard=self.status.shard)
    bucket = self._get_tmp_gcs_bucket(writer_spec)
    account_id = self._get_tmp_account_id(writer_spec)
    for f in cloudstorage.listbucket("/%s/%s" % (bucket, prefix),
                                     _account_id=account_id):
        if f.filename not in exclude_list:
            self._remove_tmpfile(f.filename, self.status.writer_spec)
Example #10
Source File: input_readers.py From python-compat-runtime with Apache License 2.0 | 6 votes |
def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
        None if no more file is left. Filename otherwise.
    """
    while True:
        if self._bucket_iter:
            try:
                return self._bucket_iter.next().filename
            except StopIteration:
                self._bucket_iter = None
                self._bucket = None
        if self._index >= len(self._filenames):
            return
        filename = self._filenames[self._index]
        self._index += 1
        if self._delimiter is None or not filename.endswith(self._delimiter):
            return filename
        self._bucket = cloudstorage.listbucket(
            filename, delimiter=self._delimiter)
        self._bucket_iter = iter(self._bucket)
Example #11
Source File: workers.py From crmint with Apache License 2.0 | 5 votes |
def _execute(self):
    self._get_ml_job_id()
    # Find directory where newest saved model is located
    bucket = self._params['jobDir']
    stats = gcs.listbucket(bucket[4:])
    newest_file = None
    for stat in stats:
        if stat.filename.find('saved_model.pb') != -1:
            if newest_file is None:
                newest_file = stat
            if newest_file:
                if stat.st_ctime > newest_file.st_ctime:
                    newest_file = stat
    body = {
        "name": self._params['versionName'],
        "description": "Test from python",
        "deploymentUri": ("gs:/" +
                          newest_file.filename[0:newest_file.filename.rfind('/')]),
        "pythonVersion": self._params['pythonVersion'],
        "runtimeVersion": self._params['runtimeVersion'],
        "framework": self._params['framework']
    }
    project_id = 'projects/%s' % self._params['project']
    self._get_ml_client()
    request = self._ml_client.projects().models().versions().create(
        parent=project_id + "/models/" + self._params['modelName'],
        body=body)
    response = self.retry(request.execute)()
    self._enqueue('MLOperationWaiter',
                  {'operation_name': response['name']},
                  60)
Example #12
Source File: storage.py From GAEPyPI with GNU General Public License v3.0 | 5 votes |
def path_exists(self, path):
    match = list(gcs.listbucket(path.rstrip('/'), delimiter='/'))
    return path.rstrip('/') in [stat.filename.rstrip('/') for stat in match]
Example #13
Source File: storage.py From GAEPyPI with GNU General Public License v3.0 | 5 votes |
def file_exists(self, path):
    match = list(gcs.listbucket(path.rstrip('/')))
    return path.rstrip('/') in [stat.filename for stat in match]
Example #14
Source File: storage.py From GAEPyPI with GNU General Public License v3.0 | 5 votes |
def ls(self, path, dir_only=False):
    padded = path if path[-1] == '/' else path + '/'
    return [f.filename for f in gcs.listbucket(padded, delimiter='/')
            if f.is_dir or not dir_only]
Example #15
Source File: gcs.py From luci-py with Apache License 2.0 | 5 votes |
def list_files(bucket, subdir=None, batch_size=100):
    """Yields filenames and stats of files inside subdirectory of a bucket.

    It always lists directories recursively.

    Arguments:
        bucket: a bucket to list.
        subdir: subdirectory to list files from or None for an entire bucket.

    Yields:
        Tuples of (filename, stats), where filename is relative to the bucket
        root directory.
    """
    # When listing an entire bucket, gcs expects /<bucket> without ending '/'.
    path_prefix = '/%s/%s' % (bucket, subdir) if subdir else '/%s' % bucket
    bucket_prefix = '/%s/' % bucket
    marker = None
    retry_params = _make_retry_params()
    while True:
        files_stats = cloudstorage.listbucket(
            path_prefix=path_prefix,
            marker=marker,
            max_keys=batch_size,
            retry_params=retry_params)
        # |files_stats| is an iterable, need to iterate through it to figure
        # out whether it's empty or not.
        empty = True
        for stat in files_stats:
            # Restart next listing from the last fetched file.
            marker = stat.filename
            # pylint: disable=C0301
            # https://developers.google.com/appengine/docs/python/googlecloudstorageclient/gcsfilestat_class
            if stat.is_dir:
                continue
            empty = False
            assert stat.filename.startswith(bucket_prefix)
            yield stat.filename[len(bucket_prefix):], stat
        # Last batch was empty -> listed all files.
        if empty:
            break
Example #16
Source File: main.py From python-docs-samples with Apache License 2.0 | 5 votes |
def list_bucket_directory_mode(self, bucket):
    self.response.write('Listbucket directory mode result:\n')
    for stat in cloudstorage.listbucket(bucket + '/b', delimiter='/'):
        self.response.write(stat)
        self.response.write('\n')
        if stat.is_dir:
            for subdir_file in cloudstorage.listbucket(
                    stat.filename, delimiter='/'):
                self.response.write(' {}'.format(subdir_file))
                self.response.write('\n')
# [START delete_files]
Example #17
Source File: main.py From python-docs-samples with Apache License 2.0 | 5 votes |
def create_files_for_list_bucket(self, bucket):
    self.response.write('Creating more files for listbucket...\n')
    filenames = [bucket + n for n in [
        '/foo1', '/foo2', '/bar', '/bar/1', '/bar/2', '/boo/']]
    for f in filenames:
        self.create_file(f)
# [START list_bucket]
Example #18
Source File: test.py From billing-export-python with Apache License 2.0 | 5 votes |
def tearDown(self):
    # for gcs_object in gcs.listbucket(main.BUCKET):
    #     gcs.delete(gcs_object.filename)
    self.testbed.deactivate()
Example #19
Source File: input_readers.py From locality-sensitive-hashing with MIT License | 5 votes |
def split_input(cls, mapper_spec):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        mapper_spec: an instance of model.MapperSpec.

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_spec = _get_params(mapper_spec, allow_old=False)
    bucket = reader_spec[cls.BUCKET_NAME_PARAM]
    filenames = reader_spec[cls.OBJECT_NAMES_PARAM]
    delimiter = reader_spec.get(cls.DELIMITER_PARAM)
    account_id = reader_spec.get(cls._ACCOUNT_ID_PARAM)
    buffer_size = reader_spec.get(cls.BUFFER_SIZE_PARAM)

    # Gather the complete list of files (expanding wildcards)
    all_filenames = []
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket + "/" + filename[:-1], delimiter=delimiter,
                    _account_id=account_id)])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    # Split into shards
    readers = []
    for shard in range(0, mapper_spec.shard_count):
        shard_filenames = all_filenames[shard::mapper_spec.shard_count]
        if shard_filenames:
            readers.append(cls(
                shard_filenames, buffer_size=buffer_size,
                _account_id=account_id, delimiter=delimiter))
    return readers
Example #20
Source File: input_readers.py From python-compat-runtime with Apache License 2.0 | 4 votes |
def split_input(cls, mapper_spec):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        mapper_spec: an instance of model.MapperSpec.

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_spec = cls.get_params(mapper_spec, allow_old=False)
    bucket = reader_spec[cls.BUCKET_NAME_PARAM]
    filenames = reader_spec[cls.OBJECT_NAMES_PARAM]
    delimiter = reader_spec.get(cls.DELIMITER_PARAM)
    account_id = reader_spec.get(cls._ACCOUNT_ID_PARAM)
    buffer_size = reader_spec.get(cls.BUFFER_SIZE_PARAM)
    fail_on_missing_input = reader_spec.get(cls.FAIL_ON_MISSING_INPUT)

    all_filenames = []
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket + "/" + filename[:-1], delimiter=delimiter,
                    _account_id=account_id)])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    readers = []
    for shard in range(0, mapper_spec.shard_count):
        shard_filenames = all_filenames[shard::mapper_spec.shard_count]
        if shard_filenames:
            reader = cls(
                shard_filenames, buffer_size=buffer_size,
                _account_id=account_id, delimiter=delimiter)
            reader._fail_on_missing_input = fail_on_missing_input
            readers.append(reader)
    return readers
Example #21
Source File: main.py From billing-export-python with Apache License 2.0 | 4 votes |
def GetDataTableData(project_name, table_date=None):
    """Read json files from cloud storage for project and an optional date.

    Args:
        project_name: name of the project to get data for.
        table_date: date object for when to get the data. When None, the last
            90 days of data are parsed.

    Returns:
        A DataTableData object of all the parsed data with product totals.
    """
    line_items = []
    date_hash = dict()
    object_prefix = os.path.join(BUCKET, project_name)
    object_marker = None
    if table_date is not None:
        object_prefix += table_date.strftime('-%Y-%m-%d.json')
    else:
        # Query for the last 90 days of data by using a 'marker' to start the
        # listing from; this limits the size of the chart data object by
        # dropping older rows from the report.
        ninety_days_ago = date.today() + timedelta(-90)
        object_marker = object_prefix + ninety_days_ago.strftime('-%Y-%m-%d.json')
    for billing_object in gcs.listbucket(object_prefix, marker=object_marker,
                                         delimiter='/'):
        billing_file = gcs.open(billing_object.filename)
        billing_data = json.loads(billing_file.read())
        for item in billing_data:
            end_time = datetime.strptime(
                item['endTime'][:-6], '%Y-%m-%dT%H:%M:%S')
            line_item = GetCanonicalLineItem(item['lineItemId'])
            if line_item not in line_items:
                line_items.append(line_item)
            row = date_hash.get(end_time, [])
            date_hash[end_time] = row
            coli = line_items.index(line_item)
            for _ in range(len(row), coli + 1):
                row.append(None)
            row[coli] = float(item['cost']['amount'])
        billing_file.close()
    # Add product totals to the parsed sku amounts.
    AddCloudProductSums(line_items, date_hash)
    data_table_data = [[bill_date] + row
                       for bill_date, row in date_hash.iteritems()]
    return DataTableData(data_table_data, line_items)
Example #22
Source File: input_readers.py From appengine-mapreduce with Apache License 2.0 | 4 votes |
def split_input(cls, mapper_spec):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        mapper_spec: an instance of model.MapperSpec.

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_spec = cls.get_params(mapper_spec, allow_old=False)
    bucket = reader_spec[cls.BUCKET_NAME_PARAM]
    filenames = reader_spec[cls.OBJECT_NAMES_PARAM]
    delimiter = reader_spec.get(cls.DELIMITER_PARAM)
    account_id = reader_spec.get(cls._ACCOUNT_ID_PARAM)
    buffer_size = reader_spec.get(cls.BUFFER_SIZE_PARAM)
    fail_on_missing_input = reader_spec.get(cls.FAIL_ON_MISSING_INPUT)

    # Gather the complete list of files (expanding wildcards)
    all_filenames = []
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket + "/" + filename[:-1], delimiter=delimiter,
                    _account_id=account_id)])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    # Split into shards
    readers = []
    for shard in range(0, mapper_spec.shard_count):
        shard_filenames = all_filenames[shard::mapper_spec.shard_count]
        if shard_filenames:
            reader = cls(
                shard_filenames, buffer_size=buffer_size,
                _account_id=account_id, delimiter=delimiter)
            reader._fail_on_missing_input = fail_on_missing_input
            readers.append(reader)
    return readers
Example #23
Source File: _gcs.py From locality-sensitive-hashing with MIT License | 4 votes |
def split_input(cls, job_config):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        job_config: map_job.JobConfig

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_params = job_config.input_reader_params
    bucket = reader_params[cls.BUCKET_NAME_PARAM]
    filenames = reader_params[cls.OBJECT_NAMES_PARAM]
    delimiter = reader_params.get(cls.DELIMITER_PARAM)
    account_id = reader_params.get(cls._ACCOUNT_ID_PARAM)
    buffer_size = reader_params.get(cls.BUFFER_SIZE_PARAM)
    path_filter = reader_params.get(cls.PATH_FILTER_PARAM)

    # Gather the complete list of files (expanding wildcards)
    all_filenames = []
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket + "/" + filename[:-1], delimiter=delimiter,
                    _account_id=account_id)])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    # Split into shards
    readers = []
    for shard in range(0, job_config.shard_count):
        shard_filenames = all_filenames[shard::job_config.shard_count]
        if shard_filenames:
            readers.append(cls(
                shard_filenames, buffer_size=buffer_size,
                _account_id=account_id, delimiter=delimiter,
                path_filter=path_filter))
    return readers
Example #24
Source File: _gcs.py From appengine-mapreduce with Apache License 2.0 | 4 votes |
def split_input(cls, job_config):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        job_config: map_job.JobConfig

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_params = job_config.input_reader_params
    bucket = reader_params[cls.BUCKET_NAME_PARAM]
    filenames = reader_params[cls.OBJECT_NAMES_PARAM]
    delimiter = reader_params.get(cls.DELIMITER_PARAM)
    account_id = reader_params.get(cls._ACCOUNT_ID_PARAM)
    buffer_size = reader_params.get(cls.BUFFER_SIZE_PARAM)
    path_filter = reader_params.get(cls.PATH_FILTER_PARAM)

    # Gather the complete list of files (expanding wildcards)
    all_filenames = []
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket + "/" + filename[:-1], delimiter=delimiter,
                    _account_id=account_id)])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    # Split into shards
    readers = []
    for shard in range(0, job_config.shard_count):
        shard_filenames = all_filenames[shard::job_config.shard_count]
        if shard_filenames:
            readers.append(cls(
                shard_filenames, buffer_size=buffer_size,
                _account_id=account_id, delimiter=delimiter,
                path_filter=path_filter))
    return readers
Example #25
Source File: output_writers_test.py From appengine-mapreduce with Apache License 2.0 | 4 votes |
def testRemoveGarbage(self):
    """Make sure abandoned files get removed."""
    writer_spec = {self.WRITER_CLS.BUCKET_NAME_PARAM: "unused",
                   self.WRITER_CLS.TMP_BUCKET_NAME_PARAM: "test"}
    mapreduce_state = self.create_mapreduce_state(output_params=writer_spec)
    shard_state = self.create_shard_state(1)
    ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
    context.Context._set(ctx)

    writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                    shard_state.shard_number, 0)
    writer.begin_slice(None)

    # our shard
    our_file = "/test/gae_mr_tmp/DummyMapReduceJobId-tmp-1-very-random"
    f = cloudstorage.open(our_file, "w")
    f.write("foo?")
    f.close()

    # not our shard
    their_file = "/test/gae_mr_tmp/DummyMapReduceJobId-tmp-3-very-random"
    f = cloudstorage.open(their_file, "w")
    f.write("bar?")
    f.close()

    # unrelated file
    real_file = "/test/this_things_should_survive"
    f = cloudstorage.open(real_file, "w")
    f.write("yes, foobar!")
    f.close()

    # Make sure bogus file still exists
    names = [l.filename for l in cloudstorage.listbucket("/test")]
    self.assertTrue(our_file in names)
    self.assertTrue(their_file in names)
    self.assertTrue(real_file in names)

    # slice end should clean up the garbage
    writer = self._serialize_and_deserialize(writer)

    names = [l.filename for l in cloudstorage.listbucket("/test")]
    self.assertFalse(our_file in names)
    self.assertTrue(their_file in names)
    self.assertTrue(real_file in names)

    # finalize shouldn't change anything
    writer.finalize(ctx, shard_state)
    self.assertFalse(our_file in names)
    self.assertTrue(their_file in names)
    self.assertTrue(real_file in names)
Example #26
Source File: output_writers_end_to_end_test.py From appengine-mapreduce with Apache License 2.0 | 4 votes |
def _runTest(self, num_shards):
    entity_count = 1000
    bucket_name = "bucket"
    tmp_bucket_name = "tmp_bucket"
    job_name = "test_map"

    for _ in range(entity_count):
        TestEntity().put()

    mapreduce_id = control.start_map(
        job_name,
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_writer": {
                "bucket_name": bucket_name,
                "tmp_bucket_name": tmp_bucket_name,
            },
        },
        shard_count=num_shards,
        output_writer_spec=self.WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)
    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = self.WRITER_CLS.get_filenames(mapreduce_state)

    self.assertEqual(num_shards, len(set(filenames)))
    total_entries = 0
    for shard in range(num_shards):
        self.assertTrue(filenames[shard].startswith(
            "/%s/%s" % (bucket_name, job_name)))
        data = cloudstorage.open(filenames[shard]).read()
        # strip() is used to remove the last newline of each file so that
        # split() does not return extraneous empty entries.
        total_entries += len(data.strip().split("\n"))
    self.assertEqual(entity_count, total_entries)

    # no files left in tmpbucket
    self.assertFalse(list(cloudstorage.listbucket("/%s" % tmp_bucket_name)))
    # and only expected files in regular bucket
    files_in_bucket = [
        f.filename for f in cloudstorage.listbucket("/%s" % bucket_name)]
    self.assertEquals(filenames, files_in_bucket)
Example #27
Source File: mapreduce_pipeline_test.py From appengine-mapreduce with Apache License 2.0 | 4 votes |
def testMapReduce(self):
    # Prepare test data
    bucket_name = "testbucket"
    job_name = "test_job"
    entity_count = 200

    for i in range(entity_count):
        TestEntity(data=str(i)).put()
        TestEntity(data=str(i)).put()

    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        job_name,
        __name__ + ".test_mapreduce_map",
        __name__ + ".test_mapreduce_reduce",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=(
            output_writers.__name__ + "._GoogleCloudStorageRecordOutputWriter"),
        mapper_params={
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "bucket_name": bucket_name
        },
        reducer_params={
            "output_writer": {
                "bucket_name": bucket_name
            },
        },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                     p.outputs.result_status.value)
    output_data = []
    for output_file in p.outputs.default.value:
        with cloudstorage.open(output_file) as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

    expected_data = [
        str((str(d), ["", ""])) for d in range(entity_count)]
    expected_data.sort()
    output_data.sort()
    self.assertEquals(expected_data, output_data)

    # Verify that mapreduce doesn't leave intermediate files behind.
    temp_file_stats = cloudstorage.listbucket("/" + bucket_name)
    for stat in temp_file_stats:
        if stat.filename:
            self.assertFalse(
                stat.filename.startswith("/%s/%s-shuffle-" %
                                         (bucket_name, job_name)))
Example #28
Source File: input_readers.py From browserscope with Apache License 2.0 | 4 votes |
def split_input(cls, mapper_spec):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        mapper_spec: an instance of model.MapperSpec.

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_spec = _get_params(mapper_spec, allow_old=False)

    # Gather the complete list of files (expanding wildcards)
    all_filenames = []
    bucket = reader_spec[cls.BUCKET_NAME_PARAM]
    filenames = reader_spec[cls.OBJECT_NAMES_PARAM]
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket, prefix=filename[:-1],
                    _account_id=reader_spec.get(cls._ACCOUNT_ID_PARAM, None))])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    # The existence of an object can only be checked by sending requests to
    # GCS, thus this is not performed at this time.

    # Split into shards
    readers = []
    for shard in range(0, mapper_spec.shard_count):
        shard_filenames = all_filenames[shard::mapper_spec.shard_count]
        if shard_filenames:
            readers.append(cls(
                shard_filenames,
                buffer_size=reader_spec.get(cls.BUFFER_SIZE_PARAM, None),
                _account_id=reader_spec.get(cls._ACCOUNT_ID_PARAM, None)))
    return readers