Python cloudstorage.listbucket() Examples
The following are 28 code examples of cloudstorage.listbucket(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module cloudstorage, or try the search function.
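Before diving into the examples, here is a minimal sketch of calling cloudstorage.listbucket() directly with the App Engine cloudstorage client. The bucket path '/my-bucket' and the prefix 'reports/' are illustrative placeholders, not values taken from any example below.

import cloudstorage

def list_files_under_prefix(bucket_path='/my-bucket', prefix='reports/'):
    """Yield the names of non-directory objects under a prefix (sketch only)."""
    # listbucket() returns an iterator of GCSFileStat objects; passing a
    # delimiter makes it behave like a one-level directory listing.
    for stat in cloudstorage.listbucket(bucket_path + '/' + prefix,
                                        delimiter='/'):
        if not stat.is_dir:
            yield stat.filename

Several of the examples below page through large listings by passing max_keys and marker instead of consuming the whole iterator in one call (see Example #8 and Example #15).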
Example #1
Source File: input_readers.py From locality-sensitive-hashing with MIT License | 6 votes |
def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
        None if no more file is left. Filename otherwise.
    """
    while True:
        if self._bucket_iter:
            try:
                return self._bucket_iter.next().filename
            except StopIteration:
                self._bucket_iter = None
                self._bucket = None
        if self._index >= len(self._filenames):
            return
        filename = self._filenames[self._index]
        self._index += 1
        if self._delimiter is None or not filename.endswith(self._delimiter):
            return filename
        self._bucket = cloudstorage.listbucket(
            filename, delimiter=self._delimiter)
        self._bucket_iter = iter(self._bucket)
Example #2
Source File: workers.py From crmint with Apache License 2.0 | 6 votes |
def _get_matching_stats(self, patterned_uris):
    stats = []
    patterns = {}
    for patterned_uri in patterned_uris:
        patterned_uri_split = patterned_uri.split('/')
        bucket = '/'.join(patterned_uri_split[1:3])
        pattern = '/'.join(patterned_uri_split[1:])
        try:
            if pattern not in patterns[bucket]:
                patterns[bucket].append(pattern)
        except KeyError:
            patterns[bucket] = [pattern]
    for bucket in patterns:
        for stat in gcs.listbucket(bucket):
            if not stat.is_dir:
                for pattern in patterns[bucket]:
                    if fnmatch(stat.filename, pattern):
                        stats.append(stat)
                        break
    return stats
Example #3
Source File: _gcs.py From locality-sensitive-hashing with MIT License | 6 votes |
def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
        None if no more file is left. Filename otherwise.
    """
    while True:
        if self._bucket_iter:
            try:
                return self._bucket_iter.next().filename
            except StopIteration:
                self._bucket_iter = None
                self._bucket = None
        if self._index >= len(self._filenames):
            return
        filename = self._filenames[self._index]
        self._index += 1
        if self._delimiter is None or not filename.endswith(self._delimiter):
            return filename
        self._bucket = cloudstorage.listbucket(
            filename, delimiter=self._delimiter)
        self._bucket_iter = iter(self._bucket)
Example #4
Source File: _gcs.py From appengine-mapreduce with Apache License 2.0 | 6 votes |
def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
        None if no more file is left. Filename otherwise.
    """
    while True:
        if self._bucket_iter:
            try:
                return self._bucket_iter.next().filename
            except StopIteration:
                self._bucket_iter = None
                self._bucket = None
        if self._index >= len(self._filenames):
            return
        filename = self._filenames[self._index]
        self._index += 1
        if self._delimiter is None or not filename.endswith(self._delimiter):
            return filename
        self._bucket = cloudstorage.listbucket(
            filename, delimiter=self._delimiter)
        self._bucket_iter = iter(self._bucket)
Example #5
Source File: main.py From billing-export-python with Apache License 2.0 | 6 votes |
def GetBillingProjects():
    """Return a list of all projects we have billing export information for."""
    projects = Projects.get_by_id('Projects')
    if projects is not None:
        logging.debug('using cached projects')
        return projects.projects
    project_list = []
    current_project = None
    for billing_object in gcs.listbucket(BUCKET, delimiter='/'):
        project_match = MatchProjectDate(billing_object.filename)
        if not project_match:
            continue
        project_name = project_match[0]
        if current_project != project_name:
            project_list.append(project_name)
            current_project = project_name
    projects = Projects(id='Projects')
    projects.projects = project_list
    projects.put()
    return project_list
Example #6
Source File: input_readers.py From appengine-mapreduce with Apache License 2.0 | 6 votes |
def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
        None if no more file is left. Filename otherwise.
    """
    while True:
        if self._bucket_iter:
            try:
                return self._bucket_iter.next().filename
            except StopIteration:
                self._bucket_iter = None
                self._bucket = None
        if self._index >= len(self._filenames):
            return
        filename = self._filenames[self._index]
        self._index += 1
        if self._delimiter is None or not filename.endswith(self._delimiter):
            return filename
        self._bucket = cloudstorage.listbucket(
            filename, delimiter=self._delimiter)
        self._bucket_iter = iter(self._bucket)
Example #7
Source File: output_writers.py From appengine-mapreduce with Apache License 2.0 | 6 votes |
def _try_to_clean_garbage(self, writer_spec, exclude_list=()):
    """Tries to remove any files created by this shard that aren't needed.

    Args:
        writer_spec: writer_spec for the MR.
        exclude_list: A list of filenames (strings) that should not be removed.
    """
    # Try to remove garbage (if any). Note that listbucket is not strongly
    # consistent so something might survive.
    tmpl = string.Template(self._TMPFILE_PREFIX)
    prefix = tmpl.substitute(
        id=self.status.mapreduce_id, shard=self.status.shard)
    bucket = self._get_tmp_gcs_bucket(writer_spec)
    account_id = self._get_tmp_account_id(writer_spec)
    for f in cloudstorage.listbucket("/%s/%s" % (bucket, prefix),
                                     _account_id=account_id):
        if f.filename not in exclude_list:
            self._remove_tmpfile(f.filename, self.status.writer_spec)
Example #8
Source File: main.py From python-docs-samples with Apache License 2.0 | 6 votes |
def list_bucket(self, bucket):
    """Create several files and paginate through them."""
    self.response.write('Listbucket result:\n')

    # Production apps should set page_size to a practical value.
    page_size = 1
    stats = cloudstorage.listbucket(bucket + '/foo', max_keys=page_size)
    while True:
        count = 0
        for stat in stats:
            count += 1
            self.response.write(repr(stat))
            self.response.write('\n')
        if count != page_size or count == 0:
            break
        stats = cloudstorage.listbucket(
            bucket + '/foo', max_keys=page_size, marker=stat.filename)
# [END list_bucket]
Example #9
Source File: output_writers.py From python-compat-runtime with Apache License 2.0 | 6 votes |
def _try_to_clean_garbage(self, writer_spec, exclude_list=()):
    """Tries to remove any files created by this shard that aren't needed.

    Args:
        writer_spec: writer_spec for the MR.
        exclude_list: A list of filenames (strings) that should not be removed.
    """
    tmpl = string.Template(self._TMPFILE_PREFIX)
    prefix = tmpl.substitute(
        id=self.status.mapreduce_id, shard=self.status.shard)
    bucket = self._get_tmp_gcs_bucket(writer_spec)
    account_id = self._get_tmp_account_id(writer_spec)
    for f in cloudstorage.listbucket("/%s/%s" % (bucket, prefix),
                                     _account_id=account_id):
        if f.filename not in exclude_list:
            self._remove_tmpfile(f.filename, self.status.writer_spec)
Example #10
Source File: input_readers.py From python-compat-runtime with Apache License 2.0 | 6 votes |
def _next_file(self):
    """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
        None if no more file is left. Filename otherwise.
    """
    while True:
        if self._bucket_iter:
            try:
                return self._bucket_iter.next().filename
            except StopIteration:
                self._bucket_iter = None
                self._bucket = None
        if self._index >= len(self._filenames):
            return
        filename = self._filenames[self._index]
        self._index += 1
        if self._delimiter is None or not filename.endswith(self._delimiter):
            return filename
        self._bucket = cloudstorage.listbucket(
            filename, delimiter=self._delimiter)
        self._bucket_iter = iter(self._bucket)
Example #11
Source File: workers.py From crmint with Apache License 2.0 | 5 votes |
def _execute(self):
    self._get_ml_job_id()
    # Find directory where newest saved model is located
    bucket = self._params['jobDir']
    stats = gcs.listbucket(bucket[4:])
    newest_file = None
    for stat in stats:
        if stat.filename.find('saved_model.pb') != -1:
            if newest_file is None:
                newest_file = stat
            if newest_file:
                if stat.st_ctime > newest_file.st_ctime:
                    newest_file = stat
    body = {
        "name": self._params['versionName'],
        "description": "Test from python",
        "deploymentUri": ("gs:/" +
                          newest_file.filename[0:newest_file.filename.rfind('/')]),
        "pythonVersion": self._params['pythonVersion'],
        "runtimeVersion": self._params['runtimeVersion'],
        "framework": self._params['framework']
    }
    project_id = 'projects/%s' % self._params['project']
    self._get_ml_client()
    request = self._ml_client.projects().models().versions().create(
        parent=project_id + "/models/" + self._params['modelName'],
        body=body)
    response = self.retry(request.execute)()
    self._enqueue('MLOperationWaiter',
                  {'operation_name': response['name']},
                  60)
Example #12
Source File: storage.py From GAEPyPI with GNU General Public License v3.0 | 5 votes |
def path_exists(self, path):
    match = list(gcs.listbucket(path.rstrip('/'), delimiter='/'))
    return path.rstrip('/') in [stat.filename.rstrip('/') for stat in match]
Example #13
Source File: storage.py From GAEPyPI with GNU General Public License v3.0 | 5 votes |
def file_exists(self, path):
    match = list(gcs.listbucket(path.rstrip('/')))
    return path.rstrip('/') in [stat.filename for stat in match]
Example #14
Source File: storage.py From GAEPyPI with GNU General Public License v3.0 | 5 votes |
def ls(self, path, dir_only=False):
    padded = path if path[-1] == '/' else path + '/'
    return [f.filename for f in gcs.listbucket(padded, delimiter='/')
            if f.is_dir or not dir_only]
Example #15
Source File: gcs.py From luci-py with Apache License 2.0 | 5 votes |
def list_files(bucket, subdir=None, batch_size=100):
    """Yields filenames and stats of files inside subdirectory of a bucket.

    It always lists directories recursively.

    Arguments:
        bucket: a bucket to list.
        subdir: subdirectory to list files from or None for an entire bucket.

    Yields:
        Tuples of (filename, stats), where filename is relative to the bucket
        root directory.
    """
    # When listing an entire bucket, gcs expects /<bucket> without ending '/'.
    path_prefix = '/%s/%s' % (bucket, subdir) if subdir else '/%s' % bucket
    bucket_prefix = '/%s/' % bucket
    marker = None
    retry_params = _make_retry_params()
    while True:
        files_stats = cloudstorage.listbucket(
            path_prefix=path_prefix,
            marker=marker,
            max_keys=batch_size,
            retry_params=retry_params)
        # |files_stats| is an iterable, need to iterate through it to figure
        # out whether it's empty or not.
        empty = True
        for stat in files_stats:
            # Restart next listing from the last fetched file.
            marker = stat.filename
            # pylint: disable=C0301
            # https://developers.google.com/appengine/docs/python/googlecloudstorageclient/gcsfilestat_class
            if stat.is_dir:
                continue
            empty = False
            assert stat.filename.startswith(bucket_prefix)
            yield stat.filename[len(bucket_prefix):], stat
        # Last batch was empty -> listed all files.
        if empty:
            break
Example #16
Source File: main.py From python-docs-samples with Apache License 2.0 | 5 votes |
def list_bucket_directory_mode(self, bucket):
    self.response.write('Listbucket directory mode result:\n')
    for stat in cloudstorage.listbucket(bucket + '/b', delimiter='/'):
        self.response.write(stat)
        self.response.write('\n')
        if stat.is_dir:
            for subdir_file in cloudstorage.listbucket(
                    stat.filename, delimiter='/'):
                self.response.write(' {}'.format(subdir_file))
                self.response.write('\n')
# [START delete_files]
Example #17
Source File: main.py From python-docs-samples with Apache License 2.0 | 5 votes |
def create_files_for_list_bucket(self, bucket):
    self.response.write('Creating more files for listbucket...\n')
    filenames = [bucket + n for n in [
        '/foo1', '/foo2', '/bar', '/bar/1', '/bar/2', '/boo/']]
    for f in filenames:
        self.create_file(f)
# [START list_bucket]
Example #18
Source File: test.py From billing-export-python with Apache License 2.0 | 5 votes |
def tearDown(self):
    # for gcs_object in gcs.listbucket(main.BUCKET):
    #     gcs.delete(gcs_object.filename)
    self.testbed.deactivate()
Example #19
Source File: input_readers.py From locality-sensitive-hashing with MIT License | 5 votes |
def split_input(cls, mapper_spec):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        mapper_spec: an instance of model.MapperSpec.

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_spec = _get_params(mapper_spec, allow_old=False)
    bucket = reader_spec[cls.BUCKET_NAME_PARAM]
    filenames = reader_spec[cls.OBJECT_NAMES_PARAM]
    delimiter = reader_spec.get(cls.DELIMITER_PARAM)
    account_id = reader_spec.get(cls._ACCOUNT_ID_PARAM)
    buffer_size = reader_spec.get(cls.BUFFER_SIZE_PARAM)

    # Gather the complete list of files (expanding wildcards)
    all_filenames = []
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket + "/" + filename[:-1], delimiter=delimiter,
                    _account_id=account_id)])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    # Split into shards
    readers = []
    for shard in range(0, mapper_spec.shard_count):
        shard_filenames = all_filenames[shard::mapper_spec.shard_count]
        if shard_filenames:
            readers.append(cls(
                shard_filenames, buffer_size=buffer_size,
                _account_id=account_id, delimiter=delimiter))
    return readers
Example #20
Source File: input_readers.py From python-compat-runtime with Apache License 2.0 | 4 votes |
def split_input(cls, mapper_spec):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        mapper_spec: an instance of model.MapperSpec.

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_spec = cls.get_params(mapper_spec, allow_old=False)
    bucket = reader_spec[cls.BUCKET_NAME_PARAM]
    filenames = reader_spec[cls.OBJECT_NAMES_PARAM]
    delimiter = reader_spec.get(cls.DELIMITER_PARAM)
    account_id = reader_spec.get(cls._ACCOUNT_ID_PARAM)
    buffer_size = reader_spec.get(cls.BUFFER_SIZE_PARAM)
    fail_on_missing_input = reader_spec.get(cls.FAIL_ON_MISSING_INPUT)

    all_filenames = []
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket + "/" + filename[:-1], delimiter=delimiter,
                    _account_id=account_id)])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    readers = []
    for shard in range(0, mapper_spec.shard_count):
        shard_filenames = all_filenames[shard::mapper_spec.shard_count]
        if shard_filenames:
            reader = cls(
                shard_filenames, buffer_size=buffer_size,
                _account_id=account_id, delimiter=delimiter)
            reader._fail_on_missing_input = fail_on_missing_input
            readers.append(reader)
    return readers
Example #21
Source File: main.py From billing-export-python with Apache License 2.0 | 4 votes |
def GetDataTableData(project_name, table_date=None):
    """Read json files from cloud storage for project and an optional date.

    Args:
        project_name: name of the project to get data for.
        table_date: date object for when to get the data. When None, the last
            90 days of data are parsed.

    Returns:
        A DataTableData object of all the parsed data with product totals.
    """
    line_items = []
    date_hash = dict()
    object_prefix = os.path.join(BUCKET, project_name)
    object_marker = None
    if table_date is not None:
        object_prefix += table_date.strftime('-%Y-%m-%d.json')
    else:
        # Query for the last 90 days of data by using a 'marker' to start the
        # listing from; this limits the size of the chart data object by
        # dropping older rows from the report.
        ninety_days_ago = date.today() + timedelta(-90)
        object_marker = object_prefix + ninety_days_ago.strftime('-%Y-%m-%d.json')
    for billing_object in gcs.listbucket(object_prefix, marker=object_marker,
                                         delimiter='/'):
        billing_file = gcs.open(billing_object.filename)
        billing_data = json.loads(billing_file.read())
        for item in billing_data:
            end_time = datetime.strptime(
                item['endTime'][:-6], '%Y-%m-%dT%H:%M:%S')
            line_item = GetCanonicalLineItem(item['lineItemId'])
            if line_item not in line_items:
                line_items.append(line_item)
            row = date_hash.get(end_time, [])
            date_hash[end_time] = row
            coli = line_items.index(line_item)
            for _ in range(len(row), coli + 1):
                row.append(None)
            row[coli] = float(item['cost']['amount'])
        billing_file.close()
    # Add product totals to the parsed sku amounts.
    AddCloudProductSums(line_items, date_hash)
    data_table_data = [[bill_date] + row
                       for bill_date, row in date_hash.iteritems()]
    return DataTableData(data_table_data, line_items)
Example #22
Source File: input_readers.py From appengine-mapreduce with Apache License 2.0 | 4 votes |
def split_input(cls, mapper_spec):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        mapper_spec: an instance of model.MapperSpec.

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_spec = cls.get_params(mapper_spec, allow_old=False)
    bucket = reader_spec[cls.BUCKET_NAME_PARAM]
    filenames = reader_spec[cls.OBJECT_NAMES_PARAM]
    delimiter = reader_spec.get(cls.DELIMITER_PARAM)
    account_id = reader_spec.get(cls._ACCOUNT_ID_PARAM)
    buffer_size = reader_spec.get(cls.BUFFER_SIZE_PARAM)
    fail_on_missing_input = reader_spec.get(cls.FAIL_ON_MISSING_INPUT)

    # Gather the complete list of files (expanding wildcards)
    all_filenames = []
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket + "/" + filename[:-1], delimiter=delimiter,
                    _account_id=account_id)])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    # Split into shards
    readers = []
    for shard in range(0, mapper_spec.shard_count):
        shard_filenames = all_filenames[shard::mapper_spec.shard_count]
        if shard_filenames:
            reader = cls(
                shard_filenames, buffer_size=buffer_size,
                _account_id=account_id, delimiter=delimiter)
            reader._fail_on_missing_input = fail_on_missing_input
            readers.append(reader)
    return readers
Example #23
Source File: _gcs.py From locality-sensitive-hashing with MIT License | 4 votes |
def split_input(cls, job_config):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        job_config: map_job.JobConfig

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_params = job_config.input_reader_params
    bucket = reader_params[cls.BUCKET_NAME_PARAM]
    filenames = reader_params[cls.OBJECT_NAMES_PARAM]
    delimiter = reader_params.get(cls.DELIMITER_PARAM)
    account_id = reader_params.get(cls._ACCOUNT_ID_PARAM)
    buffer_size = reader_params.get(cls.BUFFER_SIZE_PARAM)
    path_filter = reader_params.get(cls.PATH_FILTER_PARAM)

    # Gather the complete list of files (expanding wildcards)
    all_filenames = []
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket + "/" + filename[:-1], delimiter=delimiter,
                    _account_id=account_id)])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    # Split into shards
    readers = []
    for shard in range(0, job_config.shard_count):
        shard_filenames = all_filenames[shard::job_config.shard_count]
        if shard_filenames:
            readers.append(cls(
                shard_filenames, buffer_size=buffer_size,
                _account_id=account_id, delimiter=delimiter,
                path_filter=path_filter))
    return readers
Example #24
Source File: _gcs.py From appengine-mapreduce with Apache License 2.0 | 4 votes |
def split_input(cls, job_config):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        job_config: map_job.JobConfig

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_params = job_config.input_reader_params
    bucket = reader_params[cls.BUCKET_NAME_PARAM]
    filenames = reader_params[cls.OBJECT_NAMES_PARAM]
    delimiter = reader_params.get(cls.DELIMITER_PARAM)
    account_id = reader_params.get(cls._ACCOUNT_ID_PARAM)
    buffer_size = reader_params.get(cls.BUFFER_SIZE_PARAM)
    path_filter = reader_params.get(cls.PATH_FILTER_PARAM)

    # Gather the complete list of files (expanding wildcards)
    all_filenames = []
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket + "/" + filename[:-1], delimiter=delimiter,
                    _account_id=account_id)])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    # Split into shards
    readers = []
    for shard in range(0, job_config.shard_count):
        shard_filenames = all_filenames[shard::job_config.shard_count]
        if shard_filenames:
            readers.append(cls(
                shard_filenames, buffer_size=buffer_size,
                _account_id=account_id, delimiter=delimiter,
                path_filter=path_filter))
    return readers
Example #25
Source File: output_writers_test.py From appengine-mapreduce with Apache License 2.0 | 4 votes |
def testRemoveGarbage(self):
    """Make sure abandoned files get removed."""
    writer_spec = {self.WRITER_CLS.BUCKET_NAME_PARAM: "unused",
                   self.WRITER_CLS.TMP_BUCKET_NAME_PARAM: "test"}
    mapreduce_state = self.create_mapreduce_state(output_params=writer_spec)
    shard_state = self.create_shard_state(1)
    ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
    context.Context._set(ctx)

    writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                    shard_state.shard_number, 0)
    writer.begin_slice(None)

    # our shard
    our_file = "/test/gae_mr_tmp/DummyMapReduceJobId-tmp-1-very-random"
    f = cloudstorage.open(our_file, "w")
    f.write("foo?")
    f.close()

    # not our shard
    their_file = "/test/gae_mr_tmp/DummyMapReduceJobId-tmp-3-very-random"
    f = cloudstorage.open(their_file, "w")
    f.write("bar?")
    f.close()

    # unrelated file
    real_file = "/test/this_things_should_survive"
    f = cloudstorage.open(real_file, "w")
    f.write("yes, foobar!")
    f.close()

    # Make sure bogus file still exists
    names = [l.filename for l in cloudstorage.listbucket("/test")]
    self.assertTrue(our_file in names)
    self.assertTrue(their_file in names)
    self.assertTrue(real_file in names)

    # slice end should clean up the garbage
    writer = self._serialize_and_deserialize(writer)

    names = [l.filename for l in cloudstorage.listbucket("/test")]
    self.assertFalse(our_file in names)
    self.assertTrue(their_file in names)
    self.assertTrue(real_file in names)

    # finalize shouldn't change anything
    writer.finalize(ctx, shard_state)
    self.assertFalse(our_file in names)
    self.assertTrue(their_file in names)
    self.assertTrue(real_file in names)
Example #26
Source File: output_writers_end_to_end_test.py From appengine-mapreduce with Apache License 2.0 | 4 votes |
def _runTest(self, num_shards):
    entity_count = 1000
    bucket_name = "bucket"
    tmp_bucket_name = "tmp_bucket"
    job_name = "test_map"

    for _ in range(entity_count):
        TestEntity().put()

    mapreduce_id = control.start_map(
        job_name,
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_writer": {
                "bucket_name": bucket_name,
                "tmp_bucket_name": tmp_bucket_name,
            },
        },
        shard_count=num_shards,
        output_writer_spec=self.WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)
    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = self.WRITER_CLS.get_filenames(mapreduce_state)

    self.assertEqual(num_shards, len(set(filenames)))
    total_entries = 0
    for shard in range(num_shards):
        self.assertTrue(filenames[shard].startswith(
            "/%s/%s" % (bucket_name, job_name)))
        data = cloudstorage.open(filenames[shard]).read()
        # strip() is used to remove the last newline of each file so that
        # split() does not return extraneous empty entries.
        total_entries += len(data.strip().split("\n"))
    self.assertEqual(entity_count, total_entries)

    # no files left in tmpbucket
    self.assertFalse(list(cloudstorage.listbucket("/%s" % tmp_bucket_name)))
    # and only expected files in regular bucket
    files_in_bucket = [
        f.filename for f in cloudstorage.listbucket("/%s" % bucket_name)]
    self.assertEquals(filenames, files_in_bucket)
Example #27
Source File: mapreduce_pipeline_test.py From appengine-mapreduce with Apache License 2.0 | 4 votes |
def testMapReduce(self):
    # Prepare test data
    bucket_name = "testbucket"
    job_name = "test_job"
    entity_count = 200

    for i in range(entity_count):
        TestEntity(data=str(i)).put()
        TestEntity(data=str(i)).put()

    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        job_name,
        __name__ + ".test_mapreduce_map",
        __name__ + ".test_mapreduce_reduce",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=(
            output_writers.__name__ + "._GoogleCloudStorageRecordOutputWriter"),
        mapper_params={
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "bucket_name": bucket_name
        },
        reducer_params={
            "output_writer": {
                "bucket_name": bucket_name
            },
        },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                     p.outputs.result_status.value)
    output_data = []
    for output_file in p.outputs.default.value:
        with cloudstorage.open(output_file) as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

    expected_data = [
        str((str(d), ["", ""])) for d in range(entity_count)]
    expected_data.sort()
    output_data.sort()
    self.assertEquals(expected_data, output_data)

    # Verify that mapreduce doesn't leave intermediate files behind.
    temp_file_stats = cloudstorage.listbucket("/" + bucket_name)
    for stat in temp_file_stats:
        if stat.filename:
            self.assertFalse(
                stat.filename.startswith("/%s/%s-shuffle-" %
                                         (bucket_name, job_name)))
Example #28
Source File: input_readers.py From browserscope with Apache License 2.0 | 4 votes |
def split_input(cls, mapper_spec):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If
    there are fewer files than shards, fewer than the requested number of
    shards will be used. Input files are currently never split (although for
    some formats could be and may be split in a future implementation).

    Args:
        mapper_spec: an instance of model.MapperSpec.

    Returns:
        A list of InputReaders. None when no input data can be found.
    """
    reader_spec = _get_params(mapper_spec, allow_old=False)

    # Gather the complete list of files (expanding wildcards)
    all_filenames = []
    bucket = reader_spec[cls.BUCKET_NAME_PARAM]
    filenames = reader_spec[cls.OBJECT_NAMES_PARAM]
    for filename in filenames:
        if filename.endswith("*"):
            all_filenames.extend(
                [file_stat.filename for file_stat in cloudstorage.listbucket(
                    "/" + bucket, prefix=filename[:-1],
                    _account_id=reader_spec.get(cls._ACCOUNT_ID_PARAM, None))])
        else:
            all_filenames.append("/%s/%s" % (bucket, filename))

    # The existence of an object can only be checked by sending requests to
    # GCS, thus this is not performed at this time.

    # Split into shards
    readers = []
    for shard in range(0, mapper_spec.shard_count):
        shard_filenames = all_filenames[shard::mapper_spec.shard_count]
        if shard_filenames:
            readers.append(cls(
                shard_filenames,
                buffer_size=reader_spec.get(cls.BUFFER_SIZE_PARAM, None),
                _account_id=reader_spec.get(cls._ACCOUNT_ID_PARAM, None)))
    return readers