Python tensor2tensor.data_generators.generator_utils.maybe_download() Examples
The following are 30 code examples of tensor2tensor.data_generators.generator_utils.maybe_download(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensor2tensor.data_generators.generator_utils, or try the search function.
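All of the examples call maybe_download() with the same three positional arguments -- a target directory, a filename to store the download under, and the source URL -- and use its return value as the local path of the (possibly already present) file. The snippet below is a minimal usage sketch distilled from that pattern, not code from any of the projects; the helper name and the archive-extension check are illustrative assumptions.

import os
import tarfile

from tensor2tensor.data_generators import generator_utils


def _maybe_download_and_extract(tmp_dir, url):
  # Download url into tmp_dir unless a file with that name is already there;
  # maybe_download returns the local path either way.
  filename = os.path.basename(url)
  local_path = generator_utils.maybe_download(tmp_dir, filename, url)
  # Many of the examples below then unpack the archive next to the download.
  if local_path.endswith((".tar.gz", ".tgz")):
    with tarfile.open(local_path, "r:gz") as tar:
      tar.extractall(tmp_dir)
  return local_path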
Example #1
Source File: fsns.py From BERT with Apache License 2.0 | 6 votes |
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/"
              "street/python/fsns_urls.txt")
  fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt",
                                             list_url)
  fsns_files = [
      f.strip() for f in open(fsns_urls, "r") if f.startswith("http://")
  ]
  for url in fsns_files:
    if "/train/train" in url:
      generator_utils.maybe_download(
          data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url)
    elif "/validation/validation" in url:
      generator_utils.maybe_download(
          data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url)
    elif "charset" in url:
      generator_utils.maybe_download(data_dir, "charset_size134.txt", url)
Example #2
Source File: google_robot_pushing.py From tensor2tensor with Apache License 2.0 | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  if dataset_split == problem.DatasetSplit.TRAIN:
    urls = self.get_urls(DATA_TRAIN[0], DATA_TRAIN[1])
  else:
    urls = self.get_urls(DATA_TEST_SEEN[0], DATA_TEST_SEEN[1])
    urls += self.get_urls(DATA_TEST_NOVEL[0], DATA_TEST_NOVEL[1])

  for url in urls:
    path = generator_utils.maybe_download(tmp_dir, os.path.basename(url), url)
    for frame_number, frame, state, action in self.parse_frames(path):
      yield {
          "frame_number": [frame_number],
          "frame": frame,
          "state": state,
          "action": action,
      }
Example #3
Source File: style_transfer.py From tensor2tensor with Apache License 2.0 | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)

  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)

  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)

  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())

  source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
  return text_problems.text2text_txt_iterator(source_file, target_file)
Example #4
Source File: imdb.py From tensor2tensor with Apache License 2.0 | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  """Generate examples."""
  # Download and extract
  compressed_filename = os.path.basename(self.URL)
  download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                 self.URL)
  imdb_dir = os.path.join(tmp_dir, "aclImdb")
  if not tf.gfile.Exists(imdb_dir):
    with tarfile.open(download_path, "r:gz") as tar:
      tar.extractall(tmp_dir)

  # Generate examples
  train = dataset_split == problem.DatasetSplit.TRAIN
  dataset = "train" if train else "test"
  for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True):
    yield {
        "inputs": doc,
        "label": int(label),
    }
Example #5
Source File: fsns.py From tensor2tensor with Apache License 2.0 | 6 votes |
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/"
              "street/python/fsns_urls.txt")
  fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt",
                                             list_url)
  fsns_files = [
      f.strip() for f in open(fsns_urls, "r") if f.startswith("http://")
  ]
  for url in fsns_files:
    if "/train/train" in url:
      generator_utils.maybe_download(
          data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url)
    elif "/validation/validation" in url:
      generator_utils.maybe_download(
          data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url)
    elif "charset" in url:
      generator_utils.maybe_download(data_dir, "charset_size134.txt", url)
Example #6
Source File: imdb.py From fine-lm with MIT License | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  """Generate examples."""
  # Download and extract
  compressed_filename = os.path.basename(self.URL)
  download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                 self.URL)
  imdb_dir = os.path.join(tmp_dir, "aclImdb")
  if not tf.gfile.Exists(imdb_dir):
    with tarfile.open(download_path, "r:gz") as tar:
      tar.extractall(tmp_dir)

  # Generate examples
  train = dataset_split == problem.DatasetSplit.TRAIN
  dataset = "train" if train else "test"
  for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True):
    yield {
        "inputs": doc,
        "label": int(label),
    }
Example #7
Source File: lm1b.py From fine-lm with MIT License | 6 votes |
def _original_vocab(tmp_dir):
  """Returns a set containing the original vocabulary.

  This is important for comparing with published results.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a set of strings
  """
  vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
               "vocab-2016-09-10.txt")
  vocab_filename = os.path.basename(vocab_url + ".en")
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if not os.path.exists(vocab_filepath):
    generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
  return set([
      text_encoder.native_to_unicode(l.strip())
      for l in tf.gfile.Open(vocab_filepath)
  ])
Example #8
Source File: lm1b.py From tensor2tensor with Apache License 2.0 | 6 votes |
def _original_vocab(tmp_dir):
  """Returns a set containing the original vocabulary.

  This is important for comparing with published results.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a set of strings
  """
  vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
               "vocab-2016-09-10.txt")
  vocab_filename = os.path.basename(vocab_url + ".en")
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if not os.path.exists(vocab_filepath):
    generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
  return set([
      text_encoder.native_to_unicode(l.strip())
      for l in tf.gfile.Open(vocab_filepath)
  ])
Example #9
Source File: style_transfer.py From BERT with Apache License 2.0 | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)

  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)

  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)

  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())

  source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
  return text_problems.text2text_txt_iterator(source_file, target_file)
Example #10
Source File: enwik8.py From tensor2tensor with Apache License 2.0 | 6 votes |
def _maybe_download_corpus(tmp_dir):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    path to entire corpus as a text file.
  """
  corpus_url = "http://mattmahoney.net/dc/enwik8.zip"
  corpus_filename = os.path.basename(corpus_url)
  compressed_filepath = generator_utils.maybe_download(
      tmp_dir, corpus_filename, corpus_url)

  zip_ref = zipfile.ZipFile(compressed_filepath, "r")
  zip_ref.extractall(tmp_dir)
  zip_ref.close()

  return os.path.join(tmp_dir, "enwik8")
Example #11
Source File: multinli.py From tensor2tensor with Apache License 2.0 | 6 votes |
def _maybe_download_corpora(tmp_dir):
  """Download corpora for multinli.

  Args:
    tmp_dir: a string
  Returns:
    a string
  """
  mnli_filename = "MNLI.zip"
  mnli_finalpath = os.path.join(tmp_dir, "MNLI")
  if not tf.gfile.Exists(mnli_finalpath):
    zip_filepath = generator_utils.maybe_download(
        tmp_dir, mnli_filename, _MNLI_URL)
    zip_ref = zipfile.ZipFile(zip_filepath, "r")
    zip_ref.extractall(tmp_dir)
    zip_ref.close()

  return mnli_finalpath
Example #12
Source File: yelp_polarity.py From tensor2tensor with Apache License 2.0 | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  """Generate examples."""
  # Download and extract
  compressed_filename = os.path.basename(self.URL)
  download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                 self.URL)
  yelp_dir = os.path.join(tmp_dir, "yelp_review_polarity_csv")
  if not tf.gfile.Exists(yelp_dir):
    with tarfile.open(download_path, "r:gz") as tar:
      tar.extractall(tmp_dir)

  # Generate examples
  train = dataset_split == problem.DatasetSplit.TRAIN
  dataset = "train" if train else "test"
  for doc, label in self.doc_generator(yelp_dir, dataset, include_label=True):
    yield {
        "inputs": doc,
        "label": int(label),
    }
Example #13
Source File: bair_robot_pushing.py From fine-lm with MIT License | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  path = generator_utils.maybe_download(
      tmp_dir, os.path.basename(DATA_URL), DATA_URL)

  tar = tarfile.open(path)
  tar.extractall(tmp_dir)
  tar.close()

  if dataset_split == problem.DatasetSplit.TRAIN:
    base_dir = os.path.join(tmp_dir, "softmotion30_44k/train/*")
  else:
    base_dir = os.path.join(tmp_dir, "softmotion30_44k/test/*")

  filenames = tf.gfile.Glob(base_dir)
  for frame_number, frame, state, action in self.parse_frames(filenames):
    yield {
        "frame_number": [frame_number],
        "frame": frame,
        "state": state,
        "action": action,
    }
Example #14
Source File: yelp_full.py From tensor2tensor with Apache License 2.0 | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  """Generate examples."""
  # Download and extract
  compressed_filename = os.path.basename(self.URL)
  download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                 self.URL)
  yelp_dir = os.path.join(tmp_dir, "yelp_review_full_csv")
  if not tf.gfile.Exists(yelp_dir):
    with tarfile.open(download_path, "r:gz") as tar:
      tar.extractall(tmp_dir)

  # Generate examples
  train = dataset_split == problem.DatasetSplit.TRAIN
  dataset = "train" if train else "test"
  for doc, label in self.doc_generator(yelp_dir, dataset, include_label=True):
    yield {
        "inputs": doc,
        "label": int(label),
    }
Example #15
Source File: fsns.py From fine-lm with MIT License | 6 votes |
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/"
              "street/python/fsns_urls.txt")
  fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt",
                                             list_url)
  fsns_files = [
      f.strip() for f in open(fsns_urls, "r") if f.startswith("http://")
  ]
  for url in fsns_files:
    if "/train/train" in url:
      generator_utils.maybe_download(
          data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url)
    elif "/validation/validation" in url:
      generator_utils.maybe_download(
          data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url)
    elif "charset" in url:
      generator_utils.maybe_download(data_dir, "charset_size134.txt", url)
Example #16
Source File: mrpc.py From tensor2tensor with Apache License 2.0 | 6 votes |
def _maybe_download_corpora(self, tmp_dir):
  mrpc_dir = os.path.join(tmp_dir, self.DATA_DIR)
  tf.gfile.MakeDirs(mrpc_dir)
  mrpc_train_finalpath = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
  mrpc_test_finalpath = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
  mrpc_dev_ids_finalpath = os.path.join(mrpc_dir, "dev_ids.tsv")

  def download_file(tdir, filepath, url):
    if not tf.gfile.Exists(filepath):
      generator_utils.maybe_download(tdir, filepath, url)

  download_file(mrpc_dir, mrpc_train_finalpath, self.MRPC_TRAIN)
  download_file(mrpc_dir, mrpc_test_finalpath, self.MRPC_TEST)
  download_file(mrpc_dir, mrpc_dev_ids_finalpath, self.DEV_IDS)

  return mrpc_dir
Example #17
Source File: cnn_dailymail.py From fine-lm with MIT License | 6 votes |
def write_raw_text_to_files(all_files, urls_path, tmp_dir, is_training):
  """Write text to files."""

  def write_to_file(all_files, urls_path, tmp_dir, filename):
    with io.open(os.path.join(tmp_dir, filename + ".source"), "w") as fstory:
      with io.open(os.path.join(tmp_dir, filename + ".target"),
                   "w") as fsummary:
        for example in example_generator(all_files, urls_path, sum_token=True):
          story, summary = _story_summary_split(example)
          fstory.write(story + "\n")
          fsummary.write(summary + "\n")

  filename = "cnndm.train" if is_training else "cnndm.dev"
  tf.logging.info("Writing %s" % filename)
  write_to_file(all_files, urls_path, tmp_dir, filename)

  if not is_training:
    test_urls_path = generator_utils.maybe_download(tmp_dir, "all_test.txt",
                                                    _TEST_URLS)
    filename = "cnndm.test"
    tf.logging.info("Writing %s" % filename)
    write_to_file(all_files, test_urls_path, tmp_dir, filename)
Example #18
Source File: yelp_full.py From BERT with Apache License 2.0 | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  """Generate examples."""
  # Download and extract
  compressed_filename = os.path.basename(self.URL)
  download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                 self.URL)
  yelp_dir = os.path.join(tmp_dir, "yelp_review_full_csv")
  if not tf.gfile.Exists(yelp_dir):
    with tarfile.open(download_path, "r:gz") as tar:
      tar.extractall(tmp_dir)

  # Generate examples
  train = dataset_split == problem.DatasetSplit.TRAIN
  dataset = "train" if train else "test"
  for doc, label in self.doc_generator(yelp_dir, dataset, include_label=True):
    yield {
        "inputs": doc,
        "label": int(label),
    }
Example #19
Source File: style_transfer.py From fine-lm with MIT License | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"

  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)

  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)

  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())

  source_file = os.path.join(tmp_dir, tag + ".modern")
  target_file = os.path.join(tmp_dir, tag + ".original")
  return text_problems.text2text_txt_iterator(source_file, target_file)
Example #20
Source File: cifar.py From BERT with Apache License 2.0 | 5 votes |
def _get_cifar(directory, url):
  """Download and extract CIFAR to directory unless it is there."""
  filename = os.path.basename(url)
  path = generator_utils.maybe_download(directory, filename, url)
  tarfile.open(path, "r:gz").extractall(directory)
Example #21
Source File: qnli.py From BERT with Apache License 2.0 | 5 votes |
def _maybe_download_corpora(self, tmp_dir):
  qnli_filename = "QNLI.zip"
  qnli_finalpath = os.path.join(tmp_dir, "QNLI")
  if not tf.gfile.Exists(qnli_finalpath):
    zip_filepath = generator_utils.maybe_download(
        tmp_dir, qnli_filename, self._QNLI_URL)
    zip_ref = zipfile.ZipFile(zip_filepath, "r")
    zip_ref.extractall(tmp_dir)
    zip_ref.close()

  return qnli_finalpath
Example #22
Source File: lambada.py From tensor2tensor with Apache License 2.0 | 5 votes |
def _prepare_lambada_data(tmp_dir, data_dir, vocab_size, vocab_filename):
  """Downloading and preparing the dataset.

  Args:
    tmp_dir: temp directory
    data_dir: data directory
    vocab_size: size of vocabulary
    vocab_filename: name of vocab file
  """
  if not tf.gfile.Exists(data_dir):
    tf.gfile.MakeDirs(data_dir)

  file_path = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
  tar_all = tarfile.open(file_path)
  tar_all.extractall(tmp_dir)
  tar_all.close()
  tar_train = tarfile.open(os.path.join(tmp_dir, "train-novels.tar"))
  tar_train.extractall(tmp_dir)
  tar_train.close()

  vocab_path = os.path.join(data_dir, vocab_filename)
  if not tf.gfile.Exists(vocab_path):
    with tf.gfile.GFile(os.path.join(tmp_dir, _VOCAB), "r") as infile:
      reader = csv.reader(infile, delimiter="\t")
      words = [row[0] for row in reader]
      words = [_UNK] + words[:vocab_size]
    with tf.gfile.GFile(vocab_path, "w") as outfile:
      outfile.write("\n".join(words))
Example #23
Source File: image_lsun.py From tensor2tensor with Apache License 2.0 | 5 votes |
def _get_lsun(directory, category, split_name):
  """Downloads all lsun files to directory unless they are there."""
  generator_utils.maybe_download(directory,
                                 _LSUN_DATA_FILENAME % (category, split_name),
                                 _LSUN_URL % (category, split_name))
Example #24
Source File: cola.py From tensor2tensor with Apache License 2.0 | 5 votes |
def _maybe_download_corpora(self, tmp_dir):
  cola_filename = "CoLA.zip"
  cola_finalpath = os.path.join(tmp_dir, "CoLA")
  if not tf.gfile.Exists(cola_finalpath):
    zip_filepath = generator_utils.maybe_download(
        tmp_dir, cola_filename, self._COLA_URL)
    zip_ref = zipfile.ZipFile(zip_filepath, "r")
    zip_ref.extractall(tmp_dir)
    zip_ref.close()

  return cola_finalpath
Example #25
Source File: rte.py From tensor2tensor with Apache License 2.0 | 5 votes |
def _maybe_download_corpora(self, tmp_dir):
  rte_filename = "RTE.zip"
  rte_finalpath = os.path.join(tmp_dir, "RTE")
  if not tf.gfile.Exists(rte_finalpath):
    zip_filepath = generator_utils.maybe_download(
        tmp_dir, rte_filename, self._RTE_URL)
    zip_ref = zipfile.ZipFile(zip_filepath, "r")
    zip_ref.extractall(tmp_dir)
    zip_ref.close()

  return rte_finalpath
Example #26
Source File: snli.py From tensor2tensor with Apache License 2.0 | 5 votes |
def _download_and_parse_dataset(tmp_dir, train):
  """Downloads and prepares the dataset to be parsed by the data_generator."""
  file_path = generator_utils.maybe_download(tmp_dir, _SNLI_ZIP, _SNLI_URL)
  zip_ref = zipfile.ZipFile(file_path, 'r')
  zip_ref.extractall(tmp_dir)
  zip_ref.close()

  file_name = 'train' if train else 'dev'
  dataset_file_path = os.path.join(tmp_dir, _SNLI_DATA_PATH % file_name)
  _parse_dataset(dataset_file_path, tmp_dir, train)
Example #27
Source File: lm1b.py From tensor2tensor with Apache License 2.0 | 5 votes |
def _maybe_download_corpus(tmp_dir):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.
  """
  corpus_url = ("http://www.statmt.org/lm-benchmark/"
                "1-billion-word-language-modeling-benchmark-r13output.tar.gz")
  corpus_filename = os.path.basename(corpus_url)
  corpus_filepath = os.path.join(tmp_dir, corpus_filename)
  if not os.path.exists(corpus_filepath):
    generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url)
    with tarfile.open(corpus_filepath, "r:gz") as corpus_tar:
      corpus_tar.extractall(tmp_dir)
Example #28
Source File: ptb.py From tensor2tensor with Apache License 2.0 | 5 votes |
def _maybe_download_corpus(tmp_dir, vocab_type):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_type: which vocabulary are we using.

  Returns:
    The list of names of files.
  """
  filename = os.path.basename(PTB_URL)
  compressed_filepath = generator_utils.maybe_download(
      tmp_dir, filename, PTB_URL)
  ptb_files = []
  ptb_char_files = []

  with tarfile.open(compressed_filepath, "r:gz") as tgz:
    files = []
    # Selecting only relevant files.
    for m in tgz.getmembers():
      if "ptb" in m.name and ".txt" in m.name:
        if "char" in m.name:
          ptb_char_files += [m.name]
        else:
          ptb_files += [m.name]
        files += [m]

    tgz.extractall(tmp_dir, members=files)

  if vocab_type == text_problems.VocabType.CHARACTER:
    return ptb_char_files
  else:
    return ptb_files
Example #29
Source File: wiki.py From tensor2tensor with Apache License 2.0 | 5 votes |
def maybe_prepare_text(self, tmp_dir):
  """Download corpus if necessary, decompress, split into multiple text files.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    list of filepaths for local text files.
  """
  compressed_filename = os.path.basename(self.corpus_url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  decompressed_filepath = compressed_filepath[:-4]
  split_file_prefix = decompressed_filepath + "-part-"
  split_filepattern = split_file_prefix + "?????"
  split_files = sorted(tf.gfile.Glob(split_filepattern))
  if not split_files:
    if not tf.gfile.Exists(decompressed_filepath):
      if not tf.gfile.Exists(compressed_filepath):
        generator_utils.maybe_download(
            tmp_dir, compressed_filepath, self.corpus_url)
      assert not subprocess.call(["bunzip2", compressed_filepath])
    assert tf.gfile.Exists(decompressed_filepath)
    assert not subprocess.call([
        "split", "--line-bytes=4M", "--suffix-length=5",
        "--numeric-suffixes", decompressed_filepath, split_file_prefix])
    split_files = sorted(tf.gfile.Glob(split_filepattern))
  assert split_files
  return split_files
Example #30
Source File: quora_qpairs.py From tensor2tensor with Apache License 2.0 | 5 votes |
def _maybe_download_corpora(self, tmp_dir):
  qqp_filename = "QQP.zip"
  qqp_finalpath = os.path.join(tmp_dir, "QQP")
  if not tf.gfile.Exists(qqp_finalpath):
    zip_filepath = generator_utils.maybe_download(
        tmp_dir, qqp_filename, self._QQP_URL)
    zip_ref = zipfile.ZipFile(zip_filepath, "r")
    zip_ref.extractall(tmp_dir)
    zip_ref.close()

  return qqp_finalpath