Python tensorflow.python.ops.lookup_ops.index_table_from_file() Examples
The following are 27 code examples of tensorflow.python.ops.lookup_ops.index_table_from_file(), drawn from open-source projects. Each example lists its source file, project, and license. You may also want to check out all available functions and classes of the tensorflow.python.ops.lookup_ops module.
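Before diving into the project examples, here is a minimal, self-contained sketch (written for this page, not taken from any of the projects below, and assuming a hypothetical one-token-per-line vocab.txt) of the basic contract of index_table_from_file(): it returns a table mapping string tokens to integer ids, tokens not in the file fall back to default_value (or to OOV buckets when num_oov_buckets > 0), and the table must be initialized with tf.tables_initializer() before any lookup runs.

# Minimal sketch of index_table_from_file() in TF 1.x graph mode.
# "vocab.txt" is a hypothetical file with one token per line.
import tensorflow as tf
from tensorflow.python.ops import lookup_ops

vocab_table = lookup_ops.index_table_from_file(
    "vocab.txt", default_value=0)  # tokens not in the file map to id 0

tokens = tf.constant(["the", "quick", "<totally-unknown>"])
ids = vocab_table.lookup(tokens)   # int64 ids, one per token

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # lookup tables need explicit initialization
    print(sess.run(ids))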
Example #1
Source File: vocab_utils.py From inference with Apache License 2.0 | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Example #2
Source File: feature_column.py From Serverless-Deep-Learning-with-TensorFlow-and-AWS-Lambda with MIT License | 5 votes |
def _transform_feature(self, inputs):
  input_tensor = _to_sparse_input(inputs.get(self.key))

  if self.dtype.is_integer != input_tensor.dtype.is_integer:
    raise ValueError(
        'Column dtype and SparseTensors dtype must be compatible. '
        'key: {}, column dtype: {}, tensor dtype: {}'.format(
            self.key, self.dtype, input_tensor.dtype))

  _assert_string_or_int(
      input_tensor.dtype,
      prefix='column_name: {} input_tensor'.format(self.key))

  key_dtype = self.dtype
  if input_tensor.dtype.is_integer:
    # `index_table_from_file` requires 64-bit integer keys.
    key_dtype = dtypes.int64
    input_tensor = math_ops.to_int64(input_tensor)

  return lookup_ops.index_table_from_file(
      vocabulary_file=self.vocabulary_file,
      num_oov_buckets=self.num_oov_buckets,
      vocab_size=self.vocabulary_size,
      default_value=self.default_value,
      key_dtype=key_dtype,
      name='{}_lookup'.format(self.key)).lookup(input_tensor)
Example #3
Source File: vocab_utils.py From active-qa with Apache License 2.0 | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Example #4
Source File: vocab_utils.py From nlp-architect with Apache License 2.0 | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
    """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
    src_vocab_table = lookup_ops.index_table_from_file(src_vocab_file, default_value=UNK_ID)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(tgt_vocab_file, default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table
Example #5
Source File: qe_model.py From qebrain with BSD 2-Clause "Simplified" License | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab, vocab_size):
    # Unknown tokens fall back to `default_value`; here that is `vocab_size`,
    # i.e. one id past the last real vocabulary entry.
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=vocab_size)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=vocab_size)
    return src_vocab_table, tgt_vocab_table
Example #6
Source File: expert_model.py From qebrain with BSD 2-Clause "Simplified" License | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab, vocab_size):
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=vocab_size)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=vocab_size)
    return src_vocab_table, tgt_vocab_table
Example #7
Source File: decoder_main.py From NAO with GNU General Public License v3.0 | 5 votes |
def create_vocab_tables(vocab_file):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  vocab_table = lookup_ops.index_table_from_file(
      vocab_file, default_value=0)
  return vocab_table
Example #8
Source File: vocab_utils.py From nmt with Apache License 2.0 | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Example #9
Source File: vocab.py From THRED with MIT License | 5 votes |
def create_vocab_table(vocab_file):
    """Creates vocab tables for vocab_file."""
    return lookup_ops.index_table_from_file(vocab_file, default_value=UNK_ID)
Example #10
Source File: feature_column.py From lambda-packs with MIT License | 5 votes |
def _transform_feature(self, inputs):
  input_tensor = _to_sparse_input(inputs.get(self.key))

  if self.dtype.is_integer != input_tensor.dtype.is_integer:
    raise ValueError(
        'Column dtype and SparseTensors dtype must be compatible. '
        'key: {}, column dtype: {}, tensor dtype: {}'.format(
            self.key, self.dtype, input_tensor.dtype))

  _assert_string_or_int(
      input_tensor.dtype,
      prefix='column_name: {} input_tensor'.format(self.key))

  key_dtype = self.dtype
  if input_tensor.dtype.is_integer:
    # `index_table_from_file` requires 64-bit integer keys.
    key_dtype = dtypes.int64
    input_tensor = math_ops.to_int64(input_tensor)

  return lookup_ops.index_table_from_file(
      vocabulary_file=self.vocabulary_file,
      num_oov_buckets=self.num_oov_buckets,
      vocab_size=self.vocabulary_size,
      default_value=self.default_value,
      key_dtype=key_dtype,
      name='{}_lookup'.format(self.key)).lookup(input_tensor)
Example #11
Source File: tokenizeddata.py From ChatLearner with Apache License 2.0 | 5 votes |
def __init__(self, corpus_dir, hparams=None, training=True, buffer_size=8192):
    """
    Args:
        corpus_dir: Name of the folder storing corpus files for training.
        hparams: The object containing the loaded hyper parameters. If None, it will be
            initialized here.
        training: Whether to use this object for training.
        buffer_size: The buffer size used for mapping process during data processing.
    """
    if hparams is None:
        self.hparams = HParams(corpus_dir).hparams
    else:
        self.hparams = hparams

    self.src_max_len = self.hparams.src_max_len
    self.tgt_max_len = self.hparams.tgt_max_len

    self.training = training
    self.text_set = None
    self.id_set = None

    vocab_file = os.path.join(corpus_dir, VOCAB_FILE)
    self.vocab_size, _ = check_vocab(vocab_file)
    self.vocab_table = lookup_ops.index_table_from_file(vocab_file,
                                                        default_value=self.hparams.unk_id)
    # print("vocab_size = {}".format(self.vocab_size))

    if training:
        self.case_table = prepare_case_table()
        self.reverse_vocab_table = None
        self._load_corpus(corpus_dir)
        self._convert_to_tokens(buffer_size)
    else:
        self.case_table = None
        self.reverse_vocab_table = \
            lookup_ops.index_to_string_table_from_file(vocab_file,
                                                       default_value=self.hparams.unk_token)
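The example above pairs the forward table with lookup_ops.index_to_string_table_from_file() for the reverse (id to token) direction. As a rough sketch of that pairing, written for this page and assuming a hypothetical vocab.txt and "<unk>" token rather than the project's actual settings:

# Sketch of a forward (token -> id) and reverse (id -> token) table pair.
import tensorflow as tf
from tensorflow.python.ops import lookup_ops

vocab_file = "vocab.txt"                      # hypothetical vocabulary file
vocab_table = lookup_ops.index_table_from_file(vocab_file, default_value=0)
reverse_vocab_table = lookup_ops.index_to_string_table_from_file(
    vocab_file, default_value="<unk>")        # ids outside the vocab map to "<unk>"

ids = vocab_table.lookup(tf.constant(["hello", "world"]))
tokens = reverse_vocab_table.lookup(ids)      # round-trips in-vocabulary tokens

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(tokens))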
Example #12
Source File: model_helper.py From LSTM-CNN-CWS with Apache License 2.0 | 5 votes |
def create_infer_model(hparams, model_creator):
    """Create inference model."""
    graph = tf.Graph()
    vocab_file = hparams.vocab_file

    with graph.as_default(), tf.container("infer"):
        vocab_table = lookup_ops.index_table_from_file(
            vocab_file, default_value = UNK_ID)

        # for the labels
        '''
        Although this is nonsense for the inference procedure,
        this is to ensure the labels are not None when building the model graph.
        (refer to model.BasicModel._decode_layer)
        '''
        mapping_strings = tf.constant(['0'])
        index_table = tf.contrib.lookup.index_table_from_tensor(
            mapping = mapping_strings, default_value = 0)

        txt_placeholder = tf.placeholder(shape=[None], dtype = tf.string)
        batch_size_placeholder = tf.placeholder(shape = [], dtype = tf.int64)

        txt_dataset = tf.data.Dataset.from_tensor_slices(
            txt_placeholder)

        iterator = data_iterator.get_infer_iterator(
            txt_dataset,
            vocab_table,
            index_table,
            batch_size = batch_size_placeholder)

        model = model_creator(
            hparams,
            iterator = iterator,
            mode = tf.contrib.learn.ModeKeys.INFER,
            vocab_table = vocab_table)

    return InferModel(
        graph = graph,
        model = model,
        txt_placeholder = txt_placeholder,
        batch_size_placeholder = batch_size_placeholder,
        iterator = iterator)
Example #13
Source File: model_helper.py From LSTM-CNN-CWS with Apache License 2.0 | 5 votes |
def create_eval_model(hparams, model_creator):
    vocab_file = hparams.vocab_file
    index_file = hparams.index_file
    graph = tf.Graph()

    with graph.as_default(), tf.container("eval"):
        vocab_table = lookup_ops.index_table_from_file(
            vocab_file, default_value = UNK_ID)
        # for the labels
        index_table = lookup_ops.index_table_from_file(
            index_file, default_value = 0)

        # the file's name
        txt_file_placeholder = tf.placeholder(shape = (), dtype = tf.string)
        lb_file_placeholder = tf.placeholder(shape = (), dtype = tf.string)

        txt_dataset = tf.data.TextLineDataset(txt_file_placeholder)
        lb_dataset = tf.data.TextLineDataset(lb_file_placeholder)

        iterator = data_iterator.get_iterator(
            txt_dataset,
            lb_dataset,
            vocab_table,
            index_table,
            batch_size = hparams.batch_size,
            num_buckets = hparams.num_buckets,
            max_len = hparams.max_len)

        model = model_creator(
            hparams,
            iterator = iterator,
            mode = tf.contrib.learn.ModeKeys.EVAL,
            vocab_table = vocab_table)

    return EvalModel(
        graph = graph,
        model = model,
        txt_file_placeholder = txt_file_placeholder,
        lb_file_placeholder = lb_file_placeholder,
        iterator = iterator)
Example #14
Source File: model_helper.py From LSTM-CNN-CWS with Apache License 2.0 | 5 votes |
def create_train_model(hparams, model_creator):
    txt_file = "%s.%s" % (hparams.train_prefix, "txt")
    lb_file = "%s.%s" % (hparams.train_prefix, "lb")
    vocab_file = hparams.vocab_file
    index_file = hparams.index_file

    graph = tf.Graph()

    with graph.as_default(), tf.container("train"):
        vocab_table = lookup_ops.index_table_from_file(
            vocab_file, default_value = UNK_ID)
        # for the labels
        index_table = lookup_ops.index_table_from_file(
            index_file, default_value = 0)

        txt_dataset = tf.data.TextLineDataset(txt_file)
        lb_dataset = tf.data.TextLineDataset(lb_file)

        iterator = data_iterator.get_iterator(
            txt_dataset,
            lb_dataset,
            vocab_table,
            index_table,
            batch_size = hparams.batch_size,
            num_buckets = hparams.num_buckets,
            max_len = hparams.max_len)

        model = model_creator(
            hparams,
            iterator = iterator,
            mode = tf.contrib.learn.ModeKeys.TRAIN,
            vocab_table = vocab_table)

    return TrainModel(
        graph = graph,
        model = model,
        iterator = iterator)
Example #15
Source File: vocab_utils.py From parallax with Apache License 2.0 | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Example #16
Source File: vocab_utils.py From NETransliteration-COLING2018 with MIT License | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Example #17
Source File: vocab_utils.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Example #18
Source File: vocab_utils.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Example #19
Source File: vocab_utils.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Example #20
Source File: vocab_utils.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Example #21
Source File: vocab_utils.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Example #22
Source File: vocab_utils.py From nslt with Apache License 2.0 | 5 votes |
def create_tgt_vocab_table(tgt_vocab_file):
    """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
    tgt_vocab_table = lookup_ops.index_table_from_file(tgt_vocab_file, default_value=UNK_ID)
    return tgt_vocab_table
Example #23
Source File: train_bahdanau.py From NLP with MIT License | 4 votes |
def create_input_data(source_data_file, target_data_file, vocab_file, batch_size,
                      sos, eos, unk_id, source_max_length, target_max_length):
    source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
    target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
    vocab = lookup_ops.index_table_from_file(vocab_file, default_value=unk_id)
    output_buffer_size = batch_size * 1000

    sos_id = tf.cast(vocab.lookup(tf.constant(sos)), tf.int32)
    eos_id = tf.cast(vocab.lookup(tf.constant(eos)), tf.int32)

    dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
    dataset = dataset.map(
        lambda src, tgt: (tf.string_split([src]).values,
                          tf.string_split([tgt]).values)).prefetch(output_buffer_size)
    dataset = dataset.filter(
        lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
    # dataset = dataset.map(
    #     lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
    dataset = dataset.filter(
        lambda src, tgt: tf.logical_and(tf.size(src) <= source_max_length,
                                        tf.size(tgt) <= target_max_length))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (tf.cast(vocab.lookup(src), tf.int32),
                          tf.cast(vocab.lookup(tgt), tf.int32)))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (src,
                          tf.concat(([sos_id], tgt), 0),
                          tf.concat((tgt, [eos_id]), 0))).prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt_in, tgt_out: (
            src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)
    dataset = dataset.shuffle(100).repeat().padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([]), tf.TensorShape([])),
        padding_values=(eos_id, eos_id, eos_id, 0, 0))
    iterator = dataset.make_initializable_iterator()
    return iterator.get_next(), iterator.initializer, vocab

# ======================== SEQ2SEQ NETWORK =============================
Example #24
Source File: train_bahdanau.py From NLP with MIT License | 4 votes |
def create_input_data(source_data_file, target_data_file, source_vocab_file,
                      target_vocab_file, batch_size, sos, eos, unk_id,
                      source_max_length, target_max_length):
    source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
    target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
    source_vocab = lookup_ops.index_table_from_file(
        source_vocab_file, default_value=unk_id)
    target_vocab = lookup_ops.index_table_from_file(
        target_vocab_file, default_value=unk_id)
    output_buffer_size = batch_size * 1000

    source_eos_id = tf.cast(source_vocab.lookup(tf.constant(eos)), tf.int32)
    target_sos_id = tf.cast(target_vocab.lookup(tf.constant(sos)), tf.int32)
    target_eos_id = tf.cast(target_vocab.lookup(tf.constant(eos)), tf.int32)

    dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
    dataset = dataset.map(
        lambda src, tgt: (tf.string_split([src]).values,
                          tf.string_split([tgt]).values)).prefetch(output_buffer_size)
    dataset = dataset.filter(
        lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
    dataset = dataset.map(
        lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (tf.cast(source_vocab.lookup(src), tf.int32),
                          tf.cast(target_vocab.lookup(tgt), tf.int32)))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (src,
                          tf.concat(([target_sos_id], tgt), 0),
                          tf.concat((tgt, [target_eos_id]), 0))).prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt_in, tgt_out: (
            src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)
    dataset = dataset.shuffle(100).repeat().padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([]), tf.TensorShape([])),
        padding_values=(source_eos_id, target_eos_id, target_eos_id, 0, 0))
    iterator = dataset.make_initializable_iterator()
    return iterator.get_next(), iterator.initializer, source_vocab, target_vocab

# ======================== SEQ2SEQ NETWORK =============================
Example #25
Source File: train_bi.py From NLP with MIT License | 4 votes |
def create_input_data(source_data_file, target_data_file, source_vocab_file,
                      target_vocab_file, batch_size, sos, eos,
                      source_max_length, target_max_length):
    source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
    target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
    source_vocab = lookup_ops.index_table_from_file(
        source_vocab_file, default_value=FLAGS.unk_id)
    target_vocab = lookup_ops.index_table_from_file(
        target_vocab_file, default_value=FLAGS.unk_id)
    output_buffer_size = batch_size * 1000

    source_eos_id = tf.cast(source_vocab.lookup(tf.constant(eos)), tf.int32)
    target_sos_id = tf.cast(target_vocab.lookup(tf.constant(sos)), tf.int32)
    target_eos_id = tf.cast(target_vocab.lookup(tf.constant(eos)), tf.int32)

    dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
    dataset = dataset.map(
        lambda src, tgt: (tf.string_split([src]).values,
                          tf.string_split([tgt]).values)).prefetch(output_buffer_size)
    dataset = dataset.filter(
        lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
    dataset = dataset.map(
        lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (tf.cast(source_vocab.lookup(src), tf.int32),
                          tf.cast(target_vocab.lookup(tgt), tf.int32)))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (src,
                          tf.concat(([target_sos_id], tgt), 0),
                          tf.concat((tgt, [target_eos_id]), 0))).prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt_in, tgt_out: (
            src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)
    dataset = dataset.shuffle(100).repeat().padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([]), tf.TensorShape([])),
        padding_values=(source_eos_id, target_eos_id, target_eos_id, 0, 0))
    iterator = dataset.make_initializable_iterator()
    return iterator.get_next(), iterator.initializer, source_vocab, target_vocab

# ======================== SEQ2SEQ NETWORK =============================
Example #26
Source File: train_luong.py From NLP with MIT License | 4 votes |
def create_input_data(source_data_file, target_data_file, source_vocab_file,
                      target_vocab_file, batch_size, sos, eos, unk_id,
                      source_max_length, target_max_length):
    source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
    target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
    source_vocab = lookup_ops.index_table_from_file(
        source_vocab_file, default_value=unk_id)
    target_vocab = lookup_ops.index_table_from_file(
        target_vocab_file, default_value=unk_id)
    output_buffer_size = batch_size * 1000

    source_eos_id = tf.cast(source_vocab.lookup(tf.constant(eos)), tf.int32)
    target_sos_id = tf.cast(target_vocab.lookup(tf.constant(sos)), tf.int32)
    target_eos_id = tf.cast(target_vocab.lookup(tf.constant(eos)), tf.int32)

    dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
    dataset = dataset.map(
        lambda src, tgt: (tf.string_split([src]).values,
                          tf.string_split([tgt]).values)).prefetch(output_buffer_size)
    dataset = dataset.filter(
        lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
    dataset = dataset.map(
        lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (tf.cast(source_vocab.lookup(src), tf.int32),
                          tf.cast(target_vocab.lookup(tgt), tf.int32)))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (tf.reverse(src, axis=[0]),
                          tf.concat(([target_sos_id], tgt), 0),
                          tf.concat((tgt, [target_eos_id]), 0))).prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt_in, tgt_out: (
            src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)
    dataset = dataset.shuffle(100).repeat().padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([]), tf.TensorShape([])),
        padding_values=(source_eos_id, target_eos_id, target_eos_id, 0, 0))
    iterator = dataset.make_initializable_iterator()
    return iterator.get_next(), iterator.initializer, source_vocab, target_vocab

# ======================== SEQ2SEQ NETWORK =============================
Example #27
Source File: train.py From NLP with MIT License | 4 votes |
def create_input_data(source_data_file, target_data_file, source_vocab_file,
                      target_vocab_file, batch_size, unk_id, sos, eos,
                      source_max_length, target_max_length):
    source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
    target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
    source_vocab = lookup_ops.index_table_from_file(
        source_vocab_file, default_value=unk_id)
    target_vocab = lookup_ops.index_table_from_file(
        target_vocab_file, default_value=unk_id)
    output_buffer_size = batch_size * 1000

    source_eos_id = tf.cast(source_vocab.lookup(tf.constant(eos)), tf.int32)
    target_sos_id = tf.cast(target_vocab.lookup(tf.constant(sos)), tf.int32)
    target_eos_id = tf.cast(target_vocab.lookup(tf.constant(eos)), tf.int32)

    dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
    dataset = dataset.map(
        lambda src, tgt: (tf.string_split([src]).values,
                          tf.string_split([tgt]).values)).prefetch(output_buffer_size)
    dataset = dataset.filter(
        lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
    dataset = dataset.map(
        lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (tf.cast(source_vocab.lookup(src), tf.int32),
                          tf.cast(target_vocab.lookup(tgt), tf.int32)))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (tf.reverse(src, axis=[0]),
                          tf.concat(([target_sos_id], tgt), 0),
                          tf.concat((tgt, [target_eos_id]), 0))).prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt_in, tgt_out: (
            src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)
    dataset = dataset.shuffle(100).repeat().padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([]), tf.TensorShape([])),
        padding_values=(source_eos_id, target_eos_id, target_eos_id, 0, 0))
    iterator = dataset.make_initializable_iterator()
    return iterator.get_next(), iterator.initializer, source_vocab, target_vocab

# ======================== SEQ2SEQ NETWORK =============================
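For completeness, here is a hedged sketch (not from the original scripts) of how the tuple returned by a create_input_data() function like the ones above would typically be consumed in TF 1.x: both the lookup tables built by index_table_from_file() and the dataset iterator need explicit initialization before the first batch can be pulled. The file names and variable names are illustrative only.

# Illustrative driver for the pipelines above (TF 1.x); file names are hypothetical.
import tensorflow as tf

next_batch, iterator_initializer, source_vocab, target_vocab = create_input_data(
    "train.src", "train.tgt", "vocab.src", "vocab.tgt",
    batch_size=32, sos="<s>", eos="</s>", unk_id=0,
    source_max_length=50, target_max_length=50)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # initializes the index_table_from_file tables
    sess.run(iterator_initializer)     # initializes the dataset iterator
    src, tgt_in, tgt_out, src_len, tgt_len = sess.run(next_batch)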