Python tarfile.extract() Examples
The following are 12 code examples of tarfile.extract(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tarfile, or try the search function.
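Before the examples, a minimal sketch of the call itself may help. The archive name and destination directory below are placeholders, not taken from any of the projects on this page:

import tarfile

# "r:*" lets tarfile auto-detect the compression of the archive.
with tarfile.open("example.tar.gz", "r:*") as tar:
    for member in tar.getmembers():
        if member.isfile():
            # extract() unpacks one member at a time into `path`,
            # recreating its relative path from inside the archive.
            tar.extract(member, path="dest")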
Example #1
Source File: base.py From anybox.recipe.odoo with GNU Affero General Public License v3.0 | 6 votes |
def sandboxed_tar_extract(self, sandbox, tarfile, first=None):
    """Extract those members that are below the tarfile path 'sandbox'.

    The tarfile module official doc warns against attacks with .. in tar.

    The option to start with a first member is useful for this case, since
    the recipe consumes a first member in the tar file to get the odoo main
    directory in parts. It is taken for granted that this first member has
    already been checked.
    """
    if first is not None:
        tarfile.extract(first)

    for tinfo in tarfile:
        if tinfo.name.startswith(sandbox + '/'):
            tarfile.extract(tinfo)
        else:
            logger.warn('Tarball member %r is outside of %r. Ignored.',
                        tinfo, sandbox)
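Note that this manual sandbox check predates the extraction filters added to the standard library in Python 3.12 (and backported to later maintenance releases of 3.8-3.11). On those versions a similar defense against ".." members can be expressed directly; a minimal sketch with placeholder names, not part of the recipe above:

import tarfile

with tarfile.open("archive.tar") as tar:
    # filter="data" refuses members whose paths would escape the
    # destination (absolute paths or ".." components) and strips
    # unsafe metadata such as setuid bits and device nodes.
    tar.extractall(path="sandbox", filter="data")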
Example #2
Source File: librispeech.py From fine-lm with MIT License | 5 votes |
def generator(self, data_dir, tmp_dir, datasets,
              eos_list=None, start_from=0, how_many=0):
    del eos_list
    i = 0
    for url, subdir in datasets:
        filename = os.path.basename(url)
        compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

        read_type = "r:gz" if filename.endswith("tgz") else "r"
        with tarfile.open(compressed_file, read_type) as corpus_tar:
            # Create a subset of files that don't already exist.
            # tarfile.extractall errors when encountering an existing file
            # and tarfile.extract is extremely slow.
            members = []
            for f in corpus_tar:
                if not os.path.isfile(os.path.join(tmp_dir, f.name)):
                    members.append(f)
            corpus_tar.extractall(tmp_dir, members=members)

        data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
        data_files = _collect_data(data_dir, "flac", "txt")
        data_pairs = data_files.values()

        encoders = self.feature_encoders(None)
        audio_encoder = encoders["waveforms"]
        text_encoder = encoders["targets"]

        for utt_id, media_file, text_data in sorted(data_pairs)[start_from:]:
            if how_many > 0 and i == how_many:
                return
            i += 1
            wav_data = audio_encoder.encode(media_file)
            spk_id, unused_book_id, _ = utt_id.split("-")
            yield {
                "waveforms": wav_data,
                "waveform_lens": [len(wav_data)],
                "targets": text_encoder.encode(text_data),
                "raw_transcript": [text_data],
                "utt_id": [utt_id],
                "spk_id": [spk_id],
            }
Example #3
Source File: librispeech.py From tensor2tensor with Apache License 2.0 | 5 votes |
def generator(self, data_dir, tmp_dir, datasets,
              eos_list=None, start_from=0, how_many=0):
    del eos_list
    i = 0
    for url, subdir in datasets:
        filename = os.path.basename(url)
        compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

        read_type = "r:gz" if filename.endswith("tgz") else "r"
        with tarfile.open(compressed_file, read_type) as corpus_tar:
            # Create a subset of files that don't already exist.
            # tarfile.extractall errors when encountering an existing file
            # and tarfile.extract is extremely slow.
            members = []
            for f in corpus_tar:
                if not os.path.isfile(os.path.join(tmp_dir, f.name)):
                    members.append(f)
            corpus_tar.extractall(tmp_dir, members=members)

        raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
        data_files = _collect_data(raw_data_dir, "flac", "txt")
        data_pairs = data_files.values()

        encoders = self.feature_encoders(data_dir)
        audio_encoder = encoders["waveforms"]
        text_encoder = encoders["targets"]

        for utt_id, media_file, text_data in sorted(data_pairs)[start_from:]:
            if how_many > 0 and i == how_many:
                return
            i += 1
            wav_data = audio_encoder.encode(media_file)
            spk_id, unused_book_id, _ = utt_id.split("-")
            yield {
                "waveforms": wav_data,
                "waveform_lens": [len(wav_data)],
                "targets": text_encoder.encode(text_data),
                "raw_transcript": [text_data],
                "utt_id": [utt_id],
                "spk_id": [spk_id],
            }
Example #4
Source File: librispeech.py From BERT with Apache License 2.0 | 5 votes |
def generator(self, data_dir, tmp_dir, datasets,
              eos_list=None, start_from=0, how_many=0):
    del eos_list
    i = 0
    for url, subdir in datasets:
        filename = os.path.basename(url)
        compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

        read_type = "r:gz" if filename.endswith("tgz") else "r"
        with tarfile.open(compressed_file, read_type) as corpus_tar:
            # Create a subset of files that don't already exist.
            # tarfile.extractall errors when encountering an existing file
            # and tarfile.extract is extremely slow.
            members = []
            for f in corpus_tar:
                if not os.path.isfile(os.path.join(tmp_dir, f.name)):
                    members.append(f)
            corpus_tar.extractall(tmp_dir, members=members)

        raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
        data_files = _collect_data(raw_data_dir, "flac", "txt")
        data_pairs = data_files.values()

        encoders = self.feature_encoders(data_dir)
        audio_encoder = encoders["waveforms"]
        text_encoder = encoders["targets"]

        for utt_id, media_file, text_data in sorted(data_pairs)[start_from:]:
            if how_many > 0 and i == how_many:
                return
            i += 1
            wav_data = audio_encoder.encode(media_file)
            spk_id, unused_book_id, _ = utt_id.split("-")
            yield {
                "waveforms": wav_data,
                "waveform_lens": [len(wav_data)],
                "targets": text_encoder.encode(text_data),
                "raw_transcript": [text_data],
                "utt_id": [utt_id],
                "spk_id": [spk_id],
            }
Example #5
Source File: librispeech.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def generator(self, data_dir, tmp_dir, datasets,
              eos_list=None, start_from=0, how_many=0):
    del eos_list
    i = 0
    for url, subdir in datasets:
        filename = os.path.basename(url)
        compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

        read_type = "r:gz" if filename.endswith("tgz") else "r"
        with tarfile.open(compressed_file, read_type) as corpus_tar:
            # Create a subset of files that don't already exist.
            # tarfile.extractall errors when encountering an existing file
            # and tarfile.extract is extremely slow.
            members = []
            for f in corpus_tar:
                if not os.path.isfile(os.path.join(tmp_dir, f.name)):
                    members.append(f)
            corpus_tar.extractall(tmp_dir, members=members)

        raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
        data_files = _collect_data(raw_data_dir, "flac", "txt")
        data_pairs = data_files.values()

        encoders = self.feature_encoders(data_dir)
        audio_encoder = encoders["waveforms"]
        text_encoder = encoders["targets"]

        for utt_id, media_file, text_data in sorted(data_pairs)[start_from:]:
            if how_many > 0 and i == how_many:
                return
            i += 1
            wav_data = audio_encoder.encode(media_file)
            spk_id, unused_book_id, _ = utt_id.split("-")
            yield {
                "waveforms": wav_data,
                "waveform_lens": [len(wav_data)],
                "targets": text_encoder.encode(text_data),
                "raw_transcript": [text_data],
                "utt_id": [utt_id],
                "spk_id": [spk_id],
            }
Example #6
Source File: librispeech_specaugment.py From specAugment with Apache License 2.0 | 5 votes |
def generator(self, data_dir, tmp_dir, datasets,
              eos_list=None, start_from=0, how_many=0):
    del eos_list
    i = 0
    for url, subdir in datasets:
        filename = os.path.basename(url)
        compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

        read_type = "r:gz" if filename.endswith("tgz") else "r"
        with tarfile.open(compressed_file, read_type) as corpus_tar:
            # Create a subset of files that don't already exist.
            # tarfile.extractall errors when encountering an existing file
            # and tarfile.extract is extremely slow.
            members = []
            for f in corpus_tar:
                if not os.path.isfile(os.path.join(tmp_dir, f.name)):
                    members.append(f)
            corpus_tar.extractall(tmp_dir, members=members)

        raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
        data_files = _collect_data(raw_data_dir, "flac", "txt")
        data_pairs = data_files.values()

        encoders = self.feature_encoders(data_dir)
        audio_encoder = encoders["waveforms"]
        text_encoder = encoders["targets"]

        for utt_id, media_file, text_data in sorted(data_pairs)[start_from:]:
            if how_many > 0 and i == how_many:
                return
            i += 1
            wav_data = audio_encoder.encode(media_file)
            spk_id, unused_book_id, _ = utt_id.split("-")
            yield {
                "waveforms": wav_data,
                "waveform_lens": [len(wav_data)],
                "targets": text_encoder.encode(text_data),
                "raw_transcript": [text_data],
                "utt_id": [utt_id],
                "spk_id": [spk_id],
            }
Example #7
Source File: base.py From anybox.recipe.odoo with GNU Affero General Public License v3.0 | 5 votes |
def cleanup_odoo_dir(self):
    """Revert local modifications that have been made during installation.

    These can be, e.g., forbidden by the freeze process."""
    # from here we can't guess whether it's 'openerp' or 'odoo'.
    # Nothing guarantees that this method is called after develop().
    # It is in practice now, but one day, the extraction as a separate
    # script of freeze/extract will become a reality.
    for proj_name in ('openerp', 'odoo'):
        egg_info_dir = join(self.odoo_dir, proj_name + '.egg-info')
        if os.path.exists(egg_info_dir):
            shutil.rmtree(egg_info_dir)
Example #8
Source File: common_voice.py From fine-lm with MIT License | 4 votes |
def generator(self, data_dir, tmp_dir, datasets,
              eos_list=None, start_from=0, how_many=0):
    del eos_list
    i = 0

    filename = os.path.basename(_COMMONVOICE_URL)
    compressed_file = generator_utils.maybe_download(tmp_dir, filename,
                                                     _COMMONVOICE_URL)

    read_type = "r:gz" if filename.endswith(".tgz") else "r"
    with tarfile.open(compressed_file, read_type) as corpus_tar:
        # Create a subset of files that don't already exist.
        # tarfile.extractall errors when encountering an existing file
        # and tarfile.extract is extremely slow. For security, check that
        # all paths are relative.
        members = [
            f for f in corpus_tar
            if _is_relative(tmp_dir, f.name) and not _file_exists(tmp_dir, f.name)
        ]
        corpus_tar.extractall(tmp_dir, members=members)

    data_dir = os.path.join(tmp_dir, "cv_corpus_v1")
    data_tuples = _collect_data(data_dir)

    encoders = self.feature_encoders(None)
    audio_encoder = encoders["waveforms"]
    text_encoder = encoders["targets"]

    for dataset in datasets:
        data_tuples = (tup for tup in data_tuples
                       if tup[0].startswith(dataset))

    for utt_id, media_file, text_data in tqdm.tqdm(
            sorted(data_tuples)[start_from:]):
        if how_many > 0 and i == how_many:
            return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": ["unknown"],
        }
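The helpers _is_relative and _file_exists are defined elsewhere in common_voice.py and are not shown on this page. The sketch below is a plausible reading of what they check, based only on how they are called above, not the project's actual code:

import os

def _file_exists(path, filename):
    # True if the member has already been extracted under `path`.
    return os.path.isfile(os.path.join(path, filename))

def _is_relative(path, filename):
    # True if joining `filename` to `path` stays inside `path`,
    # i.e. the member name has no absolute or escaping ".." components.
    return os.path.abspath(os.path.join(path, filename)).startswith(
        os.path.abspath(path))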
Example #9
Source File: common_voice.py From tensor2tensor with Apache License 2.0 | 4 votes |
def generator(self, data_dir, tmp_dir, datasets,
              eos_list=None, start_from=0, how_many=0):
    del eos_list
    i = 0

    filename = os.path.basename(_COMMONVOICE_URL)
    compressed_file = generator_utils.maybe_download(tmp_dir, filename,
                                                     _COMMONVOICE_URL)

    read_type = "r:gz" if filename.endswith(".tgz") else "r"
    with tarfile.open(compressed_file, read_type) as corpus_tar:
        # Create a subset of files that don't already exist.
        # tarfile.extractall errors when encountering an existing file
        # and tarfile.extract is extremely slow. For security, check that
        # all paths are relative.
        members = [
            f for f in corpus_tar
            if _is_relative(tmp_dir, f.name) and not _file_exists(tmp_dir, f.name)
        ]
        corpus_tar.extractall(tmp_dir, members=members)

    raw_data_dir = os.path.join(tmp_dir, "cv_corpus_v1")
    data_tuples = _collect_data(raw_data_dir)

    encoders = self.feature_encoders(data_dir)
    audio_encoder = encoders["waveforms"]
    text_encoder = encoders["targets"]

    for dataset in datasets:
        data_tuples = (tup for tup in data_tuples
                       if tup[0].startswith(dataset))

    for utt_id, media_file, text_data in tqdm.tqdm(
            sorted(data_tuples)[start_from:]):
        if how_many > 0 and i == how_many:
            return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": ["unknown"],
        }
Example #10
Source File: common_voice.py From BERT with Apache License 2.0 | 4 votes |
def generator(self, data_dir, tmp_dir, datasets,
              eos_list=None, start_from=0, how_many=0):
    del eos_list
    i = 0

    filename = os.path.basename(_COMMONVOICE_URL)
    compressed_file = generator_utils.maybe_download(tmp_dir, filename,
                                                     _COMMONVOICE_URL)

    read_type = "r:gz" if filename.endswith(".tgz") else "r"
    with tarfile.open(compressed_file, read_type) as corpus_tar:
        # Create a subset of files that don't already exist.
        # tarfile.extractall errors when encountering an existing file
        # and tarfile.extract is extremely slow. For security, check that
        # all paths are relative.
        members = [
            f for f in corpus_tar
            if _is_relative(tmp_dir, f.name) and not _file_exists(tmp_dir, f.name)
        ]
        corpus_tar.extractall(tmp_dir, members=members)

    raw_data_dir = os.path.join(tmp_dir, "cv_corpus_v1")
    data_tuples = _collect_data(raw_data_dir)

    encoders = self.feature_encoders(data_dir)
    audio_encoder = encoders["waveforms"]
    text_encoder = encoders["targets"]

    for dataset in datasets:
        data_tuples = (tup for tup in data_tuples
                       if tup[0].startswith(dataset))

    for utt_id, media_file, text_data in tqdm.tqdm(
            sorted(data_tuples)[start_from:]):
        if how_many > 0 and i == how_many:
            return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": ["unknown"],
        }
Example #11
Source File: common_voice.py From training_results_v0.5 with Apache License 2.0 | 4 votes |
def generator(self, data_dir, tmp_dir, datasets,
              eos_list=None, start_from=0, how_many=0):
    del eos_list
    i = 0

    filename = os.path.basename(_COMMONVOICE_URL)
    compressed_file = generator_utils.maybe_download(tmp_dir, filename,
                                                     _COMMONVOICE_URL)

    read_type = "r:gz" if filename.endswith(".tgz") else "r"
    with tarfile.open(compressed_file, read_type) as corpus_tar:
        # Create a subset of files that don't already exist.
        # tarfile.extractall errors when encountering an existing file
        # and tarfile.extract is extremely slow. For security, check that
        # all paths are relative.
        members = [
            f for f in corpus_tar
            if _is_relative(tmp_dir, f.name) and not _file_exists(tmp_dir, f.name)
        ]
        corpus_tar.extractall(tmp_dir, members=members)

    raw_data_dir = os.path.join(tmp_dir, "cv_corpus_v1")
    data_tuples = _collect_data(raw_data_dir)

    encoders = self.feature_encoders(data_dir)
    audio_encoder = encoders["waveforms"]
    text_encoder = encoders["targets"]

    for dataset in datasets:
        data_tuples = (tup for tup in data_tuples
                       if tup[0].startswith(dataset))

    for utt_id, media_file, text_data in tqdm.tqdm(
            sorted(data_tuples)[start_from:]):
        if how_many > 0 and i == how_many:
            return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": ["unknown"],
        }
Example #12
Source File: base.py From anybox.recipe.odoo with GNU Affero General Public License v3.0 | 4 votes |
def read_odoo_setup(self):
    """Ugly method to extract requirements & version from ugly setup.py.

    Primarily designed for 6.0, but works with 6.1 as well.
    """
    old_setup = setuptools.setup
    old_distutils_setup = distutils.core.setup  # 5.0 directly imports this

    def new_setup(*args, **kw):
        self.requirements.extend(kw.get('install_requires', ()))
        self.version_detected = kw['version']

    setuptools.setup = new_setup
    distutils.core.setup = new_setup
    sys.path.insert(0, '.')
    with open(join(self.odoo_dir, 'setup.py'), 'rb') as f:
        saved_argv = sys.argv
        sys.argv = ['setup.py', 'develop']
        try:
            imp.load_module('setup', f, 'setup.py',
                            ('.py', 'r', imp.PY_SOURCE))
        except SystemExit as exception:
            if 'dsextras' in unicode(exception):
                raise EnvironmentError(
                    'Please first install PyGObject and PyGTK !')
            else:
                try:
                    self.read_release()
                except Exception as exc:
                    raise EnvironmentError(
                        'Problem while reading Odoo release.py: %s' % exc)
        except ImportError as exception:
            if 'babel' in unicode(exception):
                raise EnvironmentError(
                    'OpenERP setup.py has an unwanted import Babel.\n'
                    '=> First install Babel on your system or '
                    'virtualenv :(\n'
                    '(sudo aptitude install python-babel, '
                    'or pip install babel)')
            else:
                raise exception
        except Exception as exception:
            raise EnvironmentError('Problem while reading Odoo '
                                   'setup.py: %s' % exception)
        finally:
            sys.argv = saved_argv
            sys.path.pop(0)

    setuptools.setup = old_setup
    distutils.core.setup = old_distutils_setup
    self.apply_version_dependent_decisions()
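This recipe is Python 2 code: imp is removed in Python 3.12 and unicode() does not exist in Python 3. If the same patched-setup() trick were ported to Python 3, importlib would replace imp, roughly as in this sketch (the function name is illustrative, and str(exception) would replace unicode(exception) in the except clauses):

import importlib.util
import sys

def load_setup_module(path):
    # Execute setup.py as a throwaway module so that the monkeypatched
    # setuptools.setup / distutils.core.setup capture its keyword args.
    spec = importlib.util.spec_from_file_location("setup", path)
    module = importlib.util.module_from_spec(spec)
    sys.modules["setup"] = module
    spec.loader.exec_module(module)
    return module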