Python tensorflow.python.ops.lookup_ops.index_table_from_file() Examples

The following are 27 code examples of tensorflow.python.ops.lookup_ops.index_table_from_file(), collected from open-source projects. Each example is preceded by its source file, the project it comes from, and that project's license. You may also want to check out the other functions and classes available in the tensorflow.python.ops.lookup_ops module.
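Before the examples, here is a minimal, self-contained sketch of the typical usage pattern (TF 1.x graph mode; the vocabulary file name and tokens below are hypothetical):

import tensorflow as tf
from tensorflow.python.ops import lookup_ops

# vocab.txt is assumed to contain one token per line, e.g. "<unk>", "hello", "world".
table = lookup_ops.index_table_from_file("vocab.txt", default_value=0)
ids = table.lookup(tf.constant(["hello", "world", "never-seen"]))

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # lookup tables must be initialized before use
    print(sess.run(ids))               # e.g. [1 2 0]; unseen tokens get default_value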
Example #1
Source File: vocab_utils.py    From inference with Apache License 2.0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table 
Example #2
Source File: feature_column.py    From Serverless-Deep-Learning-with-TensorFlow-and-AWS-Lambda with MIT License
def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    _assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_file` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.to_int64(input_tensor)

    return lookup_ops.index_table_from_file(
        vocabulary_file=self.vocabulary_file,
        num_oov_buckets=self.num_oov_buckets,
        vocab_size=self.vocabulary_size,
        default_value=self.default_value,
        key_dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor) 
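In this column transformation, out-of-vocabulary keys are handled either by num_oov_buckets (extra bucket ids appended after the vocabulary) or by default_value (a single fallback id). A hedged sketch of the two modes, assuming a hypothetical three-token vocabulary file:

from tensorflow.python.ops import lookup_ops

# Mode 1: unknown keys hash into ids [vocab_size, vocab_size + num_oov_buckets).
table_oov = lookup_ops.index_table_from_file(
    vocabulary_file="colors.txt",  # hypothetical file: "red", "green", "blue"
    num_oov_buckets=2)

# Mode 2: every unknown key maps to one fixed id (-1 is the library default).
table_default = lookup_ops.index_table_from_file(
    vocabulary_file="colors.txt",
    default_value=-1)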
Example #3
Source File: vocab_utils.py    From active-qa with Apache License 2.0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table 
Example #4
Source File: vocab_utils.py    From nlp-architect with Apache License 2.0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
    """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
    src_vocab_table = lookup_ops.index_table_from_file(src_vocab_file, default_value=UNK_ID)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(tgt_vocab_file, default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table 
Example #5
Source File: qe_model.py    From qebrain with BSD 2-Clause "Simplified" License
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab, vocab_size):
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=vocab_size)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=vocab_size)
    return src_vocab_table, tgt_vocab_table 
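Unlike the UNK_ID examples above, this project passes the vocabulary size as default_value, so every out-of-vocabulary token is mapped to the index immediately after the vocabulary. A small sketch of the effect, assuming a hypothetical three-entry vocab file:

import tensorflow as tf
from tensorflow.python.ops import lookup_ops

vocab_size = 3  # hypothetical vocab.txt with three entries -> ids 0, 1, 2
table = lookup_ops.index_table_from_file("vocab.txt", default_value=vocab_size)
ids = table.lookup(tf.constant(["known_word", "totally_unknown"]))
# After tf.tables_initializer() runs, unknown tokens map to id 3 (== vocab_size),
# so the embedding matrix needs vocab_size + 1 rows.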
Example #6
Source File: expert_model.py    From qebrain with BSD 2-Clause "Simplified" License
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab, vocab_size):
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=vocab_size)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=vocab_size)
    return src_vocab_table, tgt_vocab_table 
Example #7
Source File: decoder_main.py    From NAO with GNU General Public License v3.0
def create_vocab_tables(vocab_file):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  vocab_table = lookup_ops.index_table_from_file(
      vocab_file, default_value=0)
  return vocab_table 
Example #8
Source File: vocab_utils.py    From nmt with Apache License 2.0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table 
Example #9
Source File: vocab.py    From THRED with MIT License
def create_vocab_table(vocab_file):
    """Creates vocab tables for vocab_file."""
    return lookup_ops.index_table_from_file(vocab_file, default_value=UNK_ID) 
Example #10
Source File: feature_column.py    From lambda-packs with MIT License
def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    _assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_file` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.to_int64(input_tensor)

    return lookup_ops.index_table_from_file(
        vocabulary_file=self.vocabulary_file,
        num_oov_buckets=self.num_oov_buckets,
        vocab_size=self.vocabulary_size,
        default_value=self.default_value,
        key_dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor) 
Example #11
Source File: tokenizeddata.py    From ChatLearner with Apache License 2.0
def __init__(self, corpus_dir, hparams=None, training=True, buffer_size=8192):
        """
        Args:
            corpus_dir: Name of the folder storing corpus files for training.
            hparams: The object containing the loaded hyperparameters. If None, it will be
                    initialized here.
            training: Whether to use this object for training.
            buffer_size: The buffer size used for the mapping process during data processing.
        """
        if hparams is None:
            self.hparams = HParams(corpus_dir).hparams
        else:
            self.hparams = hparams

        self.src_max_len = self.hparams.src_max_len
        self.tgt_max_len = self.hparams.tgt_max_len

        self.training = training
        self.text_set = None
        self.id_set = None

        vocab_file = os.path.join(corpus_dir, VOCAB_FILE)
        self.vocab_size, _ = check_vocab(vocab_file)
        self.vocab_table = lookup_ops.index_table_from_file(vocab_file,
                                                            default_value=self.hparams.unk_id)
        # print("vocab_size = {}".format(self.vocab_size))

        if training:
            self.case_table = prepare_case_table()
            self.reverse_vocab_table = None
            self._load_corpus(corpus_dir)
            self._convert_to_tokens(buffer_size)
        else:
            self.case_table = None
            self.reverse_vocab_table = \
                lookup_ops.index_to_string_table_from_file(vocab_file,
                                                           default_value=self.hparams.unk_token) 
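Here the same vocabulary file backs two tables: index_table_from_file for token -> id during training, and index_to_string_table_from_file for id -> token at inference time. A hedged round-trip sketch (TF 1.x; the file name and default values are hypothetical):

import tensorflow as tf
from tensorflow.python.ops import lookup_ops

vocab_file = "vocab.txt"  # hypothetical; one token per line
to_ids = lookup_ops.index_table_from_file(vocab_file, default_value=0)
to_tokens = lookup_ops.index_to_string_table_from_file(vocab_file, default_value="<unk>")

ids = to_ids.lookup(tf.constant(["hello", "world"]))
tokens = to_tokens.lookup(ids)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(tokens))  # in-vocab words round-trip, e.g. [b'hello' b'world']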
Example #12
Source File: model_helper.py    From LSTM-CNN-CWS with Apache License 2.0
def create_infer_model(hparams, model_creator):
  """Create inference model."""
  graph = tf.Graph()
  vocab_file = hparams.vocab_file

  with graph.as_default(), tf.container("infer"):
    vocab_table = lookup_ops.index_table_from_file(
      vocab_file, default_value = UNK_ID)
    # for the labels
    '''
    Although the labels are not actually used during inference, this ensures
    the labels are not None when building the model graph.
    (refer to model.BasicModel._decode_layer)
    '''
    mapping_strings = tf.constant(['0'])
    index_table = tf.contrib.lookup.index_table_from_tensor(
        mapping = mapping_strings, default_value = 0)

    txt_placeholder = tf.placeholder(shape=[None], dtype = tf.string)
    batch_size_placeholder = tf.placeholder(shape = [], dtype = tf.int64)

    txt_dataset = tf.data.Dataset.from_tensor_slices(
        txt_placeholder)
    iterator = data_iterator.get_infer_iterator(
        txt_dataset,
        vocab_table,
        index_table,
        batch_size = batch_size_placeholder)

    model = model_creator(
        hparams,
        iterator = iterator,
        mode = tf.contrib.learn.ModeKeys.INFER,
        vocab_table = vocab_table)

  return InferModel(
      graph = graph,
      model = model,
      txt_placeholder = txt_placeholder,
      batch_size_placeholder = batch_size_placeholder,
      iterator = iterator) 
Example #13
Source File: model_helper.py    From LSTM-CNN-CWS with Apache License 2.0
def create_eval_model(hparams, model_creator):
  vocab_file = hparams.vocab_file
  index_file = hparams.index_file
  graph = tf.Graph()

  with graph.as_default(), tf.container("eval"):
    vocab_table = lookup_ops.index_table_from_file(
      vocab_file, default_value = UNK_ID)
    # for the labels
    index_table = lookup_ops.index_table_from_file(
      index_file, default_value = 0)

    # the file's name
    txt_file_placeholder = tf.placeholder(shape = (), dtype = tf.string)
    lb_file_placeholder = tf.placeholder(shape = (), dtype = tf.string)
    txt_dataset = tf.data.TextLineDataset(txt_file_placeholder)
    lb_dataset = tf.data.TextLineDataset(lb_file_placeholder)

    iterator = data_iterator.get_iterator(
        txt_dataset,
        lb_dataset,
        vocab_table,
        index_table,
        batch_size = hparams.batch_size,
        num_buckets = hparams.num_buckets,
        max_len = hparams.max_len)

    model = model_creator(
        hparams,
        iterator = iterator,
        mode = tf.contrib.learn.ModeKeys.EVAL,
        vocab_table = vocab_table)

  return EvalModel(
      graph = graph,
      model = model,
      txt_file_placeholder = txt_file_placeholder,
      lb_file_placeholder = lb_file_placeholder,
      iterator = iterator) 
Example #14
Source File: model_helper.py    From LSTM-CNN-CWS with Apache License 2.0
def create_train_model(hparams, model_creator):
  txt_file = "%s.%s" % (hparams.train_prefix, "txt")
  lb_file = "%s.%s" % (hparams.train_prefix, "lb")
  vocab_file = hparams.vocab_file
  index_file = hparams.index_file

  graph = tf.Graph()

  with graph.as_default(), tf.container("train"):
    vocab_table = lookup_ops.index_table_from_file(
      vocab_file, default_value = UNK_ID)
    # for the labels
    index_table = lookup_ops.index_table_from_file(
      index_file, default_value = 0)

    txt_dataset = tf.data.TextLineDataset(txt_file)
    lb_dataset = tf.data.TextLineDataset(lb_file)

    iterator = data_iterator.get_iterator(
        txt_dataset,
        lb_dataset,
        vocab_table,
        index_table,
        batch_size = hparams.batch_size,
        num_buckets = hparams.num_buckets,
        max_len = hparams.max_len)

    model = model_creator(
        hparams,
        iterator = iterator,
        mode = tf.contrib.learn.ModeKeys.TRAIN,
        vocab_table = vocab_table)

  return TrainModel(
      graph = graph,
      model = model,
      iterator = iterator) 
Example #15
Source File: vocab_utils.py    From parallax with Apache License 2.0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table 
Example #16
Source File: vocab_utils.py    From NETransliteration-COLING2018 with MIT License
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table 
Example #17
Source File: vocab_utils.py    From training_results_v0.5 with Apache License 2.0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table 
Example #18
Source File: vocab_utils.py    From training_results_v0.5 with Apache License 2.0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table 
Example #19
Source File: vocab_utils.py    From training_results_v0.5 with Apache License 2.0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table 
Example #20
Source File: vocab_utils.py    From training_results_v0.5 with Apache License 2.0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table 
Example #21
Source File: vocab_utils.py    From training_results_v0.5 with Apache License 2.0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table 
Example #22
Source File: vocab_utils.py    From nslt with Apache License 2.0
def create_tgt_vocab_table(tgt_vocab_file):
    """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
    tgt_vocab_table = lookup_ops.index_table_from_file(tgt_vocab_file, default_value=UNK_ID)

    return tgt_vocab_table 
Example #23
Source File: train_bahdanau.py    From NLP with MIT License
def create_input_data(source_data_file, target_data_file,
                      vocab_file,
                      batch_size, sos, eos, unk_id,
                      source_max_length, target_max_length):
  source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
  target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
  vocab = lookup_ops.index_table_from_file(vocab_file, default_value=unk_id)

  output_buffer_size = batch_size * 1000

  sos_id = tf.cast(vocab.lookup(tf.constant(sos)), tf.int32)
  eos_id = tf.cast(vocab.lookup(tf.constant(eos)), tf.int32)

  dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
  dataset = dataset.map(
    lambda src, tgt: (tf.string_split([src]).values,
                      tf.string_split([tgt]).values)).prefetch(output_buffer_size)
  dataset = dataset.filter(
    lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
  # dataset = dataset.map(
  #   lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
  dataset = dataset.filter(
    lambda src, tgt: tf.logical_and(tf.size(src) <= source_max_length, tf.size(tgt) <= target_max_length))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (tf.cast(vocab.lookup(src), tf.int32),
                      tf.cast(vocab.lookup(tgt), tf.int32)))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (src,
                      tf.concat(([sos_id], tgt), 0),
                      tf.concat((tgt, [eos_id]), 0))).prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt_in, tgt_out: (
      src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)

  dataset = dataset.shuffle(100).repeat().padded_batch(
    batch_size,
    padded_shapes=(tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([]),
                   tf.TensorShape([])),
    padding_values=(eos_id,
                    eos_id,
                    eos_id,
                    0,
                    0))

  iterator = dataset.make_initializable_iterator()

  return iterator.get_next(), iterator.initializer, vocab

# ======================== SEQ2SEQ NETWORK ============================= 
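Because the pipeline above creates both a lookup table and an initializable iterator, both must be initialized before batches can be drawn. A hedged driver sketch (TF 1.x; the file paths and hyperparameter values below are placeholders):

import tensorflow as tf

# create_input_data is the function defined above; all argument values are placeholders.
next_batch, iterator_init, vocab = create_input_data(
    source_data_file="train.src", target_data_file="train.tgt",
    vocab_file="vocab.txt", batch_size=32,
    sos="<s>", eos="</s>", unk_id=0,
    source_max_length=50, target_max_length=50)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # initializes the vocab lookup table
    sess.run(iterator_init)            # initializes the dataset iterator
    src, tgt_in, tgt_out, src_len, tgt_len = sess.run(next_batch)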
Example #24
Source File: train_bahdanau.py    From NLP with MIT License
def create_input_data(source_data_file, target_data_file,
                      source_vocab_file, target_vocab_file,
                      batch_size, sos, eos, unk_id,
                      source_max_length, target_max_length):
  source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
  target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
  source_vocab = lookup_ops.index_table_from_file(
    source_vocab_file, default_value=unk_id)
  target_vocab = lookup_ops.index_table_from_file(
    target_vocab_file, default_value=unk_id)

  output_buffer_size = batch_size * 1000

  source_eos_id = tf.cast(source_vocab.lookup(tf.constant(eos)), tf.int32)
  target_sos_id = tf.cast(target_vocab.lookup(tf.constant(sos)), tf.int32)
  target_eos_id = tf.cast(target_vocab.lookup(tf.constant(eos)), tf.int32)

  dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
  dataset = dataset.map(
    lambda src, tgt: (tf.string_split([src]).values,
                      tf.string_split([tgt]).values)).prefetch(output_buffer_size)
  dataset = dataset.filter(
    lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
  dataset = dataset.map(
    lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (tf.cast(source_vocab.lookup(src), tf.int32),
                      tf.cast(target_vocab.lookup(tgt), tf.int32)))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (src,
                      tf.concat(([target_sos_id], tgt), 0),
                      tf.concat((tgt, [target_eos_id]), 0))).prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt_in, tgt_out: (
      src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)

  dataset = dataset.shuffle(100).repeat().padded_batch(
    batch_size,
    padded_shapes=(tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([]),
                   tf.TensorShape([])),
    padding_values=(source_eos_id,
                    target_eos_id,
                    target_eos_id,
                    0,
                    0))

  iterator = dataset.make_initializable_iterator()

  return iterator.get_next(), iterator.initializer, source_vocab, target_vocab

# ======================== SEQ2SEQ NETWORK ============================= 
Example #25
Source File: train_bi.py    From NLP with MIT License
def create_input_data(source_data_file, target_data_file,
                      source_vocab_file, target_vocab_file,
                      batch_size, sos, eos,
                      source_max_length, target_max_length):
  source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
  target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
  source_vocab = lookup_ops.index_table_from_file(
    source_vocab_file, default_value=FLAGS.unk_id)
  target_vocab = lookup_ops.index_table_from_file(
    target_vocab_file, default_value=FLAGS.unk_id)

  output_buffer_size = batch_size * 1000

  source_eos_id = tf.cast(source_vocab.lookup(tf.constant(eos)), tf.int32)
  target_sos_id = tf.cast(target_vocab.lookup(tf.constant(sos)), tf.int32)
  target_eos_id = tf.cast(target_vocab.lookup(tf.constant(eos)), tf.int32)

  dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
  dataset = dataset.map(
    lambda src, tgt: (tf.string_split([src]).values,
                      tf.string_split([tgt]).values)).prefetch(output_buffer_size)
  dataset = dataset.filter(
    lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
  dataset = dataset.map(
    lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (tf.cast(source_vocab.lookup(src), tf.int32),
                      tf.cast(target_vocab.lookup(tgt), tf.int32)))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (src,
                      tf.concat(([target_sos_id], tgt), 0),
                      tf.concat((tgt, [target_eos_id]), 0))).prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt_in, tgt_out: (
      src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)

  dataset = dataset.shuffle(100).repeat().padded_batch(
    batch_size,
    padded_shapes=(tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([]),
                   tf.TensorShape([])),
    padding_values=(source_eos_id,
                    target_eos_id,
                    target_eos_id,
                    0,
                    0))

  iterator = dataset.make_initializable_iterator()

  return iterator.get_next(), iterator.initializer, source_vocab, target_vocab

# ======================== SEQ2SEQ NETWORK ============================= 
Example #26
Source File: train_luong.py    From NLP with MIT License
def create_input_data(source_data_file, target_data_file,
                      source_vocab_file, target_vocab_file,
                      batch_size, sos, eos, unk_id,
                      source_max_length, target_max_length):
  source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
  target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
  source_vocab = lookup_ops.index_table_from_file(
    source_vocab_file, default_value=unk_id)
  target_vocab = lookup_ops.index_table_from_file(
    target_vocab_file, default_value=unk_id)

  output_buffer_size = batch_size * 1000

  source_eos_id = tf.cast(source_vocab.lookup(tf.constant(eos)), tf.int32)
  target_sos_id = tf.cast(target_vocab.lookup(tf.constant(sos)), tf.int32)
  target_eos_id = tf.cast(target_vocab.lookup(tf.constant(eos)), tf.int32)

  dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
  dataset = dataset.map(
    lambda src, tgt: (tf.string_split([src]).values,
                      tf.string_split([tgt]).values)).prefetch(output_buffer_size)
  dataset = dataset.filter(
    lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
  dataset = dataset.map(
    lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (tf.cast(source_vocab.lookup(src), tf.int32),
                      tf.cast(target_vocab.lookup(tgt), tf.int32)))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (tf.reverse(src, axis=[0]),
                      tf.concat(([target_sos_id], tgt), 0),
                      tf.concat((tgt, [target_eos_id]), 0))).prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt_in, tgt_out: (
      src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)

  dataset = dataset.shuffle(100).repeat().padded_batch(
    batch_size,
    padded_shapes=(tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([]),
                   tf.TensorShape([])),
    padding_values=(source_eos_id,
                    target_eos_id,
                    target_eos_id,
                    0,
                    0))

  iterator = dataset.make_initializable_iterator()

  return iterator.get_next(), iterator.initializer, source_vocab, target_vocab

# ======================== SEQ2SEQ NETWORK ============================= 
Example #27
Source File: train.py    From NLP with MIT License
def create_input_data(source_data_file, target_data_file,
                      source_vocab_file, target_vocab_file,
                      batch_size, unk_id, sos, eos,
                      source_max_length, target_max_length):
  source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
  target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
  source_vocab = lookup_ops.index_table_from_file(
    source_vocab_file, default_value=unk_id)
  target_vocab = lookup_ops.index_table_from_file(
    target_vocab_file, default_value=unk_id)

  output_buffer_size = batch_size * 1000

  source_eos_id = tf.cast(source_vocab.lookup(tf.constant(eos)), tf.int32)
  target_sos_id = tf.cast(target_vocab.lookup(tf.constant(sos)), tf.int32)
  target_eos_id = tf.cast(target_vocab.lookup(tf.constant(eos)), tf.int32)

  dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
  dataset = dataset.map(
    lambda src, tgt: (tf.string_split([src]).values,
                      tf.string_split([tgt]).values)).prefetch(output_buffer_size)
  dataset = dataset.filter(
    lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
  dataset = dataset.map(
    lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (tf.cast(source_vocab.lookup(src), tf.int32),
                      tf.cast(target_vocab.lookup(tgt), tf.int32)))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (tf.reverse(src, axis=[0]),
                      tf.concat(([target_sos_id], tgt), 0),
                      tf.concat((tgt, [target_eos_id]), 0))).prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt_in, tgt_out: (
      src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)

  dataset = dataset.shuffle(100).repeat().padded_batch(
    batch_size,
    padded_shapes=(tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([]),
                   tf.TensorShape([])),
    padding_values=(source_eos_id,
                    target_eos_id,
                    target_eos_id,
                    0,
                    0))

  iterator = dataset.make_initializable_iterator()

  return iterator.get_next(), iterator.initializer, source_vocab, target_vocab

# ======================== SEQ2SEQ NETWORK =============================