Python tensor2tensor.data_generators.generator_utils.maybe_download() Examples

The following are 30 code examples of tensor2tensor.data_generators.generator_utils.maybe_download(), drawn from open-source projects; the originating project and source file are noted above each example. You may also want to check out all available functions and classes of the module tensor2tensor.data_generators.generator_utils.
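For orientation, the call pattern shared by all of the examples below is maybe_download(directory, filename, uri): download uri into directory/filename unless the file already exists, and return the local filepath. A minimal sketch, with a placeholder scratch directory and URL:

import os

from tensor2tensor.data_generators import generator_utils

tmp_dir = "/tmp/t2t_datagen"  # placeholder scratch directory
url = "http://mattmahoney.net/dc/enwik8.zip"  # any downloadable file

# Skips the download if tmp_dir already contains the file.
path = generator_utils.maybe_download(tmp_dir, os.path.basename(url), url)
print(path)  # /tmp/t2t_datagen/enwik8.zip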
Example #1
Source File: fsns.py    From BERT with Apache License 2.0
def generate_data(self, data_dir, tmp_dir, task_id=-1):
    list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/"
                "street/python/fsns_urls.txt")
    fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt",
                                               list_url)
    with open(fsns_urls, "r") as f:
      fsns_files = [
          line.strip() for line in f if line.startswith("http://")
      ]
    for url in fsns_files:
      if "/train/train" in url:
        generator_utils.maybe_download(
            data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url)
      elif "/validation/validation" in url:
        generator_utils.maybe_download(
            data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url)
      elif "charset" in url:
        generator_utils.maybe_download(data_dir, "charset_size134.txt", url) 
Example #2
Source File: google_robot_pushing.py    From tensor2tensor with Apache License 2.0
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    if dataset_split == problem.DatasetSplit.TRAIN:
      urls = self.get_urls(DATA_TRAIN[0], DATA_TRAIN[1])
    else:
      urls = self.get_urls(DATA_TEST_SEEN[0], DATA_TEST_SEEN[1])
      urls += self.get_urls(DATA_TEST_NOVEL[0], DATA_TEST_NOVEL[1])

    for url in urls:
      path = generator_utils.maybe_download(tmp_dir, os.path.basename(url), url)
      for frame_number, frame, state, action in self.parse_frames(path):
        yield {
            "frame_number": [frame_number],
            "frame": frame,
            "state": state,
            "action": action,
        } 
Example #3
Source File: style_transfer.py    From tensor2tensor with Apache License 2.0
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    dataset = self.dataset_url(dataset_split)

    url = dataset[0][0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    generator_utils.maybe_download(tmp_dir, compressed_filename, url)

    mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
    with tarfile.open(compressed_filepath, mode) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
    return text_problems.text2text_txt_iterator(source_file,
                                                target_file) 
Example #4
Source File: imdb.py    From tensor2tensor with Apache License 2.0
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Generate examples."""
    # Download and extract
    compressed_filename = os.path.basename(self.URL)
    download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                   self.URL)
    imdb_dir = os.path.join(tmp_dir, "aclImdb")
    if not tf.gfile.Exists(imdb_dir):
      with tarfile.open(download_path, "r:gz") as tar:
        tar.extractall(tmp_dir)

    # Generate examples
    train = dataset_split == problem.DatasetSplit.TRAIN
    dataset = "train" if train else "test"
    for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True):
      yield {
          "inputs": doc,
          "label": int(label),
      } 
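A hedged sketch of driving a generator like this end to end through the t2t registry; the problem name "sentiment_imdb" and both directories are assumptions for illustration:

from tensor2tensor import problems
from tensor2tensor.data_generators import problem

imdb = problems.problem("sentiment_imdb")  # assumed registered name
samples = imdb.generate_samples("/tmp/t2t_data", "/tmp/t2t_tmp",
                                problem.DatasetSplit.TRAIN)
for i, sample in enumerate(samples):
  print(sample["label"], sample["inputs"][:60])
  if i >= 2:  # peek at the first few samples only
    break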
Example #5
Source File: fsns.py    From tensor2tensor with Apache License 2.0
def generate_data(self, data_dir, tmp_dir, task_id=-1):
    list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/"
                "street/python/fsns_urls.txt")
    fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt",
                                               list_url)
    with open(fsns_urls, "r") as f:
      fsns_files = [
          line.strip() for line in f if line.startswith("http://")
      ]
    for url in fsns_files:
      if "/train/train" in url:
        generator_utils.maybe_download(
            data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url)
      elif "/validation/validation" in url:
        generator_utils.maybe_download(
            data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url)
      elif "charset" in url:
        generator_utils.maybe_download(data_dir, "charset_size134.txt", url) 
Example #6
Source File: imdb.py    From fine-lm with MIT License
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Generate examples."""
    # Download and extract
    compressed_filename = os.path.basename(self.URL)
    download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                   self.URL)
    imdb_dir = os.path.join(tmp_dir, "aclImdb")
    if not tf.gfile.Exists(imdb_dir):
      with tarfile.open(download_path, "r:gz") as tar:
        tar.extractall(tmp_dir)

    # Generate examples
    train = dataset_split == problem.DatasetSplit.TRAIN
    dataset = "train" if train else "test"
    for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True):
      yield {
          "inputs": doc,
          "label": int(label),
      } 
Example #7
Source File: lm1b.py    From fine-lm with MIT License
def _original_vocab(tmp_dir):
  """Returns a set containing the original vocabulary.

  This is important for comparing with published results.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a set of strings
  """
  vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
               "vocab-2016-09-10.txt")
  vocab_filename = os.path.basename(vocab_url + ".en")
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if not os.path.exists(vocab_filepath):
    generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
  return set([
      text_encoder.native_to_unicode(l.strip())
      for l in tf.gfile.Open(vocab_filepath)
  ]) 
Example #8
Source File: lm1b.py    From tensor2tensor with Apache License 2.0
def _original_vocab(tmp_dir):
  """Returns a set containing the original vocabulary.

  This is important for comparing with published results.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a set of strings
  """
  vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
               "vocab-2016-09-10.txt")
  vocab_filename = os.path.basename(vocab_url + ".en")
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if not os.path.exists(vocab_filepath):
    generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
  return set([
      text_encoder.native_to_unicode(l.strip())
      for l in tf.gfile.Open(vocab_filepath)
  ]) 
Example #9
Source File: style_transfer.py    From BERT with Apache License 2.0
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    dataset = self.dataset_url(dataset_split)

    url = dataset[0][0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    generator_utils.maybe_download(tmp_dir, compressed_filename, url)

    mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
    with tarfile.open(compressed_filepath, mode) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
    return text_problems.text2text_txt_iterator(source_file,
                                                target_file) 
Example #10
Source File: enwik8.py    From tensor2tensor with Apache License 2.0
def _maybe_download_corpus(tmp_dir):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    path to entire corpus as a text file.
  """
  corpus_url = "http://mattmahoney.net/dc/enwik8.zip"
  corpus_filename = os.path.basename(corpus_url)
  compressed_filepath = generator_utils.maybe_download(
      tmp_dir, corpus_filename, corpus_url)

  zip_ref = zipfile.ZipFile(compressed_filepath, "r")
  zip_ref.extractall(tmp_dir)
  zip_ref.close()

  return os.path.join(tmp_dir, "enwik8") 
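The ZipFile open/extract/close sequence above leaves the archive open if extractall raises. A behaviorally equivalent sketch using a context manager, which applies just as well to the zip handling in Examples #11, #21, #24, #25, #26, and #30:

import zipfile

def _extract_zip(compressed_filepath, tmp_dir):
  # Same effect as ZipFile(...).extractall(...) plus close(), but the
  # archive is closed even if extraction fails.
  with zipfile.ZipFile(compressed_filepath, "r") as zip_ref:
    zip_ref.extractall(tmp_dir)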
Example #11
Source File: multinli.py    From tensor2tensor with Apache License 2.0
def _maybe_download_corpora(tmp_dir):
  """Download corpora for multinli.

  Args:
    tmp_dir: a string
  Returns:
    a string
  """
  mnli_filename = "MNLI.zip"
  mnli_finalpath = os.path.join(tmp_dir, "MNLI")
  if not tf.gfile.Exists(mnli_finalpath):
    zip_filepath = generator_utils.maybe_download(
        tmp_dir, mnli_filename, _MNLI_URL)
    zip_ref = zipfile.ZipFile(zip_filepath, "r")
    zip_ref.extractall(tmp_dir)
    zip_ref.close()

  return mnli_finalpath 
Example #12
Source File: yelp_polarity.py    From tensor2tensor with Apache License 2.0
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Generate examples."""
    # Download and extract
    compressed_filename = os.path.basename(self.URL)
    download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                   self.URL)
    yelp_dir = os.path.join(tmp_dir, "yelp_review_polarity_csv")
    if not tf.gfile.Exists(yelp_dir):
      with tarfile.open(download_path, "r:gz") as tar:
        tar.extractall(tmp_dir)

    # Generate examples
    train = dataset_split == problem.DatasetSplit.TRAIN
    dataset = "train" if train else "test"
    for doc, label in self.doc_generator(yelp_dir, dataset, include_label=True):
      yield {
          "inputs": doc,
          "label": int(label),
      } 
Example #13
Source File: bair_robot_pushing.py    From fine-lm with MIT License
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    path = generator_utils.maybe_download(
        tmp_dir, os.path.basename(DATA_URL), DATA_URL)

    tar = tarfile.open(path)
    tar.extractall(tmp_dir)
    tar.close()

    if dataset_split == problem.DatasetSplit.TRAIN:
      base_dir = os.path.join(tmp_dir, "softmotion30_44k/train/*")
    else:
      base_dir = os.path.join(tmp_dir, "softmotion30_44k/test/*")

    filenames = tf.gfile.Glob(base_dir)
    for frame_number, frame, state, action in self.parse_frames(filenames):
      yield {
          "frame_number": [frame_number],
          "frame": frame,
          "state": state,
          "action": action,
      } 
Example #14
Source File: yelp_full.py    From tensor2tensor with Apache License 2.0
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Generate examples."""
    # Download and extract
    compressed_filename = os.path.basename(self.URL)
    download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                   self.URL)
    yelp_dir = os.path.join(tmp_dir, "yelp_review_full_csv")
    if not tf.gfile.Exists(yelp_dir):
      with tarfile.open(download_path, "r:gz") as tar:
        tar.extractall(tmp_dir)

    # Generate examples
    train = dataset_split == problem.DatasetSplit.TRAIN
    dataset = "train" if train else "test"
    for doc, label in self.doc_generator(yelp_dir, dataset, include_label=True):
      yield {
          "inputs": doc,
          "label": int(label),
      } 
Example #15
Source File: fsns.py    From fine-lm with MIT License
def generate_data(self, data_dir, tmp_dir, task_id=-1):
    list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/"
                "street/python/fsns_urls.txt")
    fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt",
                                               list_url)
    with open(fsns_urls, "r") as f:
      fsns_files = [
          line.strip() for line in f if line.startswith("http://")
      ]
    for url in fsns_files:
      if "/train/train" in url:
        generator_utils.maybe_download(
            data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url)
      elif "/validation/validation" in url:
        generator_utils.maybe_download(
            data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url)
      elif "charset" in url:
        generator_utils.maybe_download(data_dir, "charset_size134.txt", url) 
Example #16
Source File: mrpc.py    From tensor2tensor with Apache License 2.0
def _maybe_download_corpora(self, tmp_dir):
    mrpc_dir = os.path.join(tmp_dir, self.DATA_DIR)
    tf.gfile.MakeDirs(mrpc_dir)
    mrpc_train_finalpath = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
    mrpc_test_finalpath = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
    mrpc_dev_ids_finalpath = os.path.join(mrpc_dir, "dev_ids.tsv")

    def download_file(tdir, filepath, url):
      if not tf.gfile.Exists(filepath):
        generator_utils.maybe_download(tdir, filepath, url)

    download_file(mrpc_dir, mrpc_train_finalpath, self.MRPC_TRAIN)
    download_file(mrpc_dir, mrpc_test_finalpath, self.MRPC_TEST)
    download_file(mrpc_dir, mrpc_dev_ids_finalpath, self.DEV_IDS)

    return mrpc_dir 
Example #17
Source File: cnn_dailymail.py    From fine-lm with MIT License
def write_raw_text_to_files(all_files, urls_path, tmp_dir, is_training):
  """Write text to files."""

  def write_to_file(all_files, urls_path, tmp_dir, filename):
    with io.open(os.path.join(tmp_dir, filename + ".source"), "w") as fstory:
      with io.open(os.path.join(tmp_dir, filename + ".target"),
                   "w") as fsummary:
        for example in example_generator(all_files, urls_path, sum_token=True):
          story, summary = _story_summary_split(example)
          fstory.write(story + "\n")
          fsummary.write(summary + "\n")

  filename = "cnndm.train" if is_training else "cnndm.dev"
  tf.logging.info("Writing %s" % filename)
  write_to_file(all_files, urls_path, tmp_dir, filename)

  if not is_training:
    test_urls_path = generator_utils.maybe_download(tmp_dir, "all_test.txt",
                                                    _TEST_URLS)
    filename = "cnndm.test"
    tf.logging.info("Writing %s" % filename)
    write_to_file(all_files, test_urls_path, tmp_dir, filename) 
Example #18
Source File: yelp_full.py    From BERT with Apache License 2.0
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Generate examples."""
    # Download and extract
    compressed_filename = os.path.basename(self.URL)
    download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                   self.URL)
    yelp_dir = os.path.join(tmp_dir, "yelp_review_full_csv")
    if not tf.gfile.Exists(yelp_dir):
      with tarfile.open(download_path, "r:gz") as tar:
        tar.extractall(tmp_dir)

    # Generate examples
    train = dataset_split == problem.DatasetSplit.TRAIN
    dataset = "train" if train else "test"
    for doc, label in self.doc_generator(yelp_dir, dataset, include_label=True):
      yield {
          "inputs": doc,
          "label": int(label),
      } 
Example #19
Source File: style_transfer.py    From fine-lm with MIT License
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    dataset = self.dataset_url(dataset_split)

    tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"

    url = dataset[0][0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    generator_utils.maybe_download(tmp_dir, compressed_filename, url)

    mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
    with tarfile.open(compressed_filepath, mode) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    source_file = os.path.join(tmp_dir, tag + ".modern")
    target_file = os.path.join(tmp_dir, tag + ".original")
    return text_problems.text2text_txt_iterator(source_file,
                                                target_file) 
Example #20
Source File: cifar.py    From BERT with Apache License 2.0
def _get_cifar(directory, url):
  """Download and extract CIFAR to directory unless it is there."""
  filename = os.path.basename(url)
  path = generator_utils.maybe_download(directory, filename, url)
  tarfile.open(path, "r:gz").extractall(directory) 
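A hedged usage sketch; the URL is the canonical CIFAR-10 python archive and the directory is a placeholder, neither taken from the excerpt:

cifar10_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
_get_cifar("/tmp/t2t_tmp", cifar10_url)  # downloads and extracts once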
Example #21
Source File: qnli.py    From BERT with Apache License 2.0
def _maybe_download_corpora(self, tmp_dir):
    qnli_filename = "QNLI.zip"
    qnli_finalpath = os.path.join(tmp_dir, "QNLI")
    if not tf.gfile.Exists(qnli_finalpath):
      zip_filepath = generator_utils.maybe_download(
          tmp_dir, qnli_filename, self._QNLI_URL)
      zip_ref = zipfile.ZipFile(zip_filepath, "r")
      zip_ref.extractall(tmp_dir)
      zip_ref.close()

    return qnli_finalpath 
Example #22
Source File: lambada.py    From tensor2tensor with Apache License 2.0
def _prepare_lambada_data(tmp_dir, data_dir, vocab_size, vocab_filename):
  """Downloading and preparing the dataset.

  Args:
    tmp_dir: temp directory
    data_dir: data directory
    vocab_size: size of vocabulary
    vocab_filename: name of vocab file

  """

  if not tf.gfile.Exists(data_dir):
    tf.gfile.MakeDirs(data_dir)

  file_path = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
  tar_all = tarfile.open(file_path)
  tar_all.extractall(tmp_dir)
  tar_all.close()
  tar_train = tarfile.open(os.path.join(tmp_dir, "train-novels.tar"))
  tar_train.extractall(tmp_dir)
  tar_train.close()

  vocab_path = os.path.join(data_dir, vocab_filename)
  if not tf.gfile.Exists(vocab_path):
    with tf.gfile.GFile(os.path.join(tmp_dir, _VOCAB), "r") as infile:
      reader = csv.reader(infile, delimiter="\t")
      words = [row[0] for row in reader]
      words = [_UNK] + words[:vocab_size]
    with tf.gfile.GFile(vocab_path, "w") as outfile:
      outfile.write("\n".join(words)) 
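The vocabulary file written above is a plain newline-separated token list. A hedged sketch of reading it back with t2t's TokenTextEncoder, reusing vocab_path and _UNK from the excerpt:

from tensor2tensor.data_generators import text_encoder

# vocab_path and _UNK as defined in the excerpt above.
encoder = text_encoder.TokenTextEncoder(vocab_path, replace_oov=_UNK)
ids = encoder.encode("a sentence drawn from the corpus")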
Example #23
Source File: image_lsun.py    From tensor2tensor with Apache License 2.0
def _get_lsun(directory, category, split_name):
  """Downloads all lsun files to directory unless they are there."""
  generator_utils.maybe_download(directory,
                                 _LSUN_DATA_FILENAME % (category, split_name),
                                 _LSUN_URL % (category, split_name)) 
Example #24
Source File: cola.py    From tensor2tensor with Apache License 2.0
def _maybe_download_corpora(self, tmp_dir):
    cola_filename = "CoLA.zip"
    cola_finalpath = os.path.join(tmp_dir, "CoLA")
    if not tf.gfile.Exists(cola_finalpath):
      zip_filepath = generator_utils.maybe_download(
          tmp_dir, cola_filename, self._COLA_URL)
      zip_ref = zipfile.ZipFile(zip_filepath, "r")
      zip_ref.extractall(tmp_dir)
      zip_ref.close()

    return cola_finalpath 
Example #25
Source File: rte.py    From tensor2tensor with Apache License 2.0
def _maybe_download_corpora(self, tmp_dir):
    rte_filename = "RTE.zip"
    rte_finalpath = os.path.join(tmp_dir, "RTE")
    if not tf.gfile.Exists(rte_finalpath):
      zip_filepath = generator_utils.maybe_download(
          tmp_dir, rte_filename, self._RTE_URL)
      zip_ref = zipfile.ZipFile(zip_filepath, "r")
      zip_ref.extractall(tmp_dir)
      zip_ref.close()

    return rte_finalpath 
Example #26
Source File: snli.py    From tensor2tensor with Apache License 2.0
def _download_and_parse_dataset(tmp_dir, train):
  """Downloads and prepairs the dataset to be parsed by the data_generator."""
  file_path = generator_utils.maybe_download(tmp_dir, _SNLI_ZIP, _SNLI_URL)
  zip_ref = zipfile.ZipFile(file_path, 'r')
  zip_ref.extractall(tmp_dir)
  zip_ref.close()

  file_name = 'train' if train else 'dev'
  dataset_file_path = os.path.join(tmp_dir, _SNLI_DATA_PATH % file_name)
  _parse_dataset(dataset_file_path, tmp_dir, train) 
Example #27
Source File: lm1b.py    From tensor2tensor with Apache License 2.0
def _maybe_download_corpus(tmp_dir):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.
  """
  corpus_url = ("http://www.statmt.org/lm-benchmark/"
                "1-billion-word-language-modeling-benchmark-r13output.tar.gz")
  corpus_filename = os.path.basename(corpus_url)
  corpus_filepath = os.path.join(tmp_dir, corpus_filename)
  if not os.path.exists(corpus_filepath):
    generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url)
    with tarfile.open(corpus_filepath, "r:gz") as corpus_tar:
      corpus_tar.extractall(tmp_dir) 
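Note that this version never extracts if a previous run downloaded the tarball but failed before unpacking. A hedged variant keyed on the extracted directory instead; the directory name is an assumption about the archive's layout:

import os
import tarfile

from tensor2tensor.data_generators import generator_utils

def _maybe_download_and_extract_corpus(tmp_dir):
  """Like the above, but re-extracts if only the tarball is present."""
  corpus_url = ("http://www.statmt.org/lm-benchmark/"
                "1-billion-word-language-modeling-benchmark-r13output.tar.gz")
  # Assumed top-level directory inside the archive.
  extracted = os.path.join(
      tmp_dir, "1-billion-word-language-modeling-benchmark-r13output")
  corpus_filepath = generator_utils.maybe_download(
      tmp_dir, os.path.basename(corpus_url), corpus_url)
  if not os.path.exists(extracted):
    with tarfile.open(corpus_filepath, "r:gz") as corpus_tar:
      corpus_tar.extractall(tmp_dir)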
Example #28
Source File: ptb.py    From tensor2tensor with Apache License 2.0
def _maybe_download_corpus(tmp_dir, vocab_type):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_type: which vocabulary are we using.

  Returns:
    The list of names of files.
  """
  filename = os.path.basename(PTB_URL)
  compressed_filepath = generator_utils.maybe_download(
      tmp_dir, filename, PTB_URL)
  ptb_files = []
  ptb_char_files = []

  with tarfile.open(compressed_filepath, "r:gz") as tgz:
    files = []
    # Selecting only relevant files.
    for m in tgz.getmembers():
      if "ptb" in m.name and ".txt" in m.name:
        if "char" in m.name:
          ptb_char_files += [m.name]
        else:
          ptb_files += [m.name]
        files += [m]

    tgz.extractall(tmp_dir, members=files)

  if vocab_type == text_problems.VocabType.CHARACTER:
    return ptb_char_files
  else:
    return ptb_files 
Example #29
Source File: wiki.py    From tensor2tensor with Apache License 2.0
def maybe_prepare_text(self, tmp_dir):
    """Download corpus if necessary, decompress, split into multiple text files.

    Args:
      tmp_dir: directory containing dataset.

    Returns:
      list of filepaths for local text files.
    """
    compressed_filename = os.path.basename(self.corpus_url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    decompressed_filepath = compressed_filepath[:-4]
    split_file_prefix = decompressed_filepath + "-part-"
    split_filepattern = split_file_prefix + "?????"
    split_files = sorted(tf.gfile.Glob(split_filepattern))
    if not split_files:
      if not tf.gfile.Exists(decompressed_filepath):
        if not tf.gfile.Exists(compressed_filepath):
          generator_utils.maybe_download(
              tmp_dir, compressed_filepath, self.corpus_url)
        assert not subprocess.call(["bunzip2", compressed_filepath])
      assert tf.gfile.Exists(decompressed_filepath)
      assert not subprocess.call([
          "split", "--line-bytes=4M", "--suffix-length=5",
          "--numeric-suffixes", decompressed_filepath, split_file_prefix])
      split_files = sorted(tf.gfile.Glob(split_filepattern))
    assert split_files
    return split_files 
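The shell-out to bunzip2 keeps memory use low but assumes a Unix userland on PATH. A portable sketch of just the decompression step using the standard-library bz2 module; a minimal sketch, not a drop-in replacement for the split step:

import bz2
import shutil

def _bunzip2(compressed_filepath, decompressed_filepath):
  # Streaming decompression, so the whole corpus never sits in memory.
  with bz2.open(compressed_filepath, "rb") as src:
    with open(decompressed_filepath, "wb") as dst:
      shutil.copyfileobj(src, dst)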
Example #30
Source File: quora_qpairs.py    From tensor2tensor with Apache License 2.0
def _maybe_download_corpora(self, tmp_dir):
    qqp_filename = "QQP.zip"
    qqp_finalpath = os.path.join(tmp_dir, "QQP")
    if not tf.gfile.Exists(qqp_finalpath):
      zip_filepath = generator_utils.maybe_download(
          tmp_dir, qqp_filename, self._QQP_URL)
      zip_ref = zipfile.ZipFile(zip_filepath, "r")
      zip_ref.extractall(tmp_dir)
      zip_ref.close()

    return qqp_finalpath
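
Examples #11, #21, #24, #25, and #30 repeat the same download-and-unzip pattern with only the names and URLs changing. A hedged sketch of a shared helper that could factor it out; the helper name is illustrative, not part of generator_utils:

import os
import zipfile

import tensorflow as tf
from tensor2tensor.data_generators import generator_utils

def maybe_download_and_unzip(tmp_dir, zip_filename, extracted_dirname, url):
  """Downloads and extracts a zip corpus unless it is already unpacked."""
  finalpath = os.path.join(tmp_dir, extracted_dirname)
  if not tf.gfile.Exists(finalpath):
    zip_filepath = generator_utils.maybe_download(tmp_dir, zip_filename, url)
    with zipfile.ZipFile(zip_filepath, "r") as zip_ref:
      zip_ref.extractall(tmp_dir)
  return finalpath

# e.g. maybe_download_and_unzip(tmp_dir, "QQP.zip", "QQP", _QQP_URL)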