Python tensorflow.string_split() Examples

The following are 30 code examples of tensorflow.string_split(), drawn from open-source projects. The source file and project are listed above each example.
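For orientation before the examples: tf.string_split takes a 1-D tensor of strings and returns a tf.SparseTensor whose values are the individual tokens and whose indices record (row, token position). Below is a minimal sketch under the TF 1.x session API that all of these examples use; in TensorFlow 2.x the equivalent is tf.strings.split (which returns a RaggedTensor), or tf.compat.v1.string_split.

import tensorflow as tf

sentences = tf.constant(["hello world", "a b c"])
tokens = tf.string_split(sentences, delimiter=" ")  # SparseTensor of tokens

with tf.Session() as sess:
    indices, values, dense_shape = sess.run(tokens)
    # values      -> [b'hello', b'world', b'a', b'b', b'c']
    # indices     -> [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2]]
    # dense_shape -> [2, 3]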
Example #1
Source File: inference.py From fine-lm with MIT License

def load_data(input_file, input_vocab):
    """Returns an iterator over the input file.

    Args:
      input_file: The input text file.
      input_vocab: The input vocabulary.

    Returns:
      A dataset batch iterator.
    """
    dataset = tf.data.TextLineDataset(input_file)
    dataset = dataset.map(lambda x: tf.string_split([x]).values)
    dataset = dataset.map(input_vocab.lookup)
    dataset = dataset.map(lambda x: {
        "ids": x,
        "length": tf.shape(x)[0]})
    dataset = dataset.padded_batch(64, {
        "ids": [None],
        "length": []})
    return dataset.make_initializable_iterator()
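A usage note on load_data above: it returns an initializable iterator and relies on a vocabulary lookup table, so both must be initialized before batches can be pulled. A minimal, hypothetical driver sketch (the file name and the assumption that input_vocab is backed by a tf.contrib.lookup table are illustrative, not taken from the original project):

# Hypothetical driver for load_data (TF 1.x session API).
iterator = load_data("sentences.txt", input_vocab)  # placeholder path
batch = iterator.get_next()

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # initialize the vocabulary lookup table
    sess.run(iterator.initializer)     # initialize the dataset iterator
    while True:
        try:
            ids, lengths = sess.run([batch["ids"], batch["length"]])
        except tf.errors.OutOfRangeError:
            break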
Example #2
Source File: split_tokens_decoder.py From reaction_prediction_seq2seq with Apache License 2.0

def decode(self, data, items):
    decoded_items = {}

    # Split tokens
    tokens = tf.string_split([data], delimiter=self.delimiter).values

    # Optionally prepend a special token
    if self.prepend_token is not None:
        tokens = tf.concat([[self.prepend_token], tokens], 0)

    # Optionally append a special token
    if self.append_token is not None:
        tokens = tf.concat([tokens, [self.append_token]], 0)

    decoded_items[self.length_feature_name] = tf.size(tokens)
    decoded_items[self.tokens_feature_name] = tokens
    return [decoded_items[_] for _ in items]
Example #3
Source File: input_fn.py From professional-services with Apache License 2.0

def parse_raw_text(sentence):
    """Splits text tensor by word to sparse sequence of tokens.

    Args:
      sentence: `tf.string`, with text record to split.

    Returns:
      Dictionary mapping feature name to tensors with the following entries
      `constants.TOKENS` mapping to a `SparseTensor` and
      `constants.SEQUENCE_LENGTH` mapping to a one-dimensional integer `Tensor`.
    """
    tokens = tf.regex_replace(sentence, _CHAR_TO_FILTER_OUT, ' ',
                              replace_global=True)
    sparse_sequence = tf.string_split(tokens)
    features = {
        constants.TOKENS: sparse_sequence,
        constants.SEQUENCE_LENGTH: get_sparse_tensor_size(sparse_sequence)
    }
    return features
Example #4
Source File: content.py From ConMask with MIT License

def multiple_content_lookup(content, vocab_table, ids, name=None):
    """
    :param content:
    :param vocab_table:
    :param ids:
    :param name:
    :return: 2-D [batch_size, max_length_in_batch] content id matrix,
             1-D [batch_size] content len vector
    """
    with tf.name_scope(name, 'multiple_content_lookup', [content, vocab_table, ids]):
        content_list = tf.nn.embedding_lookup(content, ids)

        extracted_sparse_content = tf.string_split(content_list, delimiter=' ')

        sparse_content = tf.SparseTensor(indices=extracted_sparse_content.indices,
                                         values=vocab_table.lookup(extracted_sparse_content.values),
                                         dense_shape=extracted_sparse_content.dense_shape)

        extracted_content_ids = tf.sparse_tensor_to_dense(sparse_content,
                                                          default_value=0,
                                                          name='dense_content')
        extracted_content_len = tf.reduce_sum(
            tf.cast(tf.not_equal(extracted_content_ids, 0), tf.int32), axis=-1)

        return extracted_content_ids, extracted_content_len
Example #5
Source File: content.py From ConMask with MIT License

def entity_content_embedding_lookup(entities, content, content_len, vocab_table,
                                    word_embedding, str_pad, name=None):
    """Lookup entity word embeddings given a flatten 1-D entity id list and content lookup table

    :param entities: Must be a 1-D entity vector
    :param content:
    :param content_len:
    :param vocab_table:
    :param word_embedding:
    :param str_pad:
    :param name:
    :return:
    """
    with tf.device('/cpu:0'):
        with tf.name_scope(name, 'entity_content_lookup',
                           [entities, content, content_len, vocab_table, word_embedding]):
            ent_content = tf.string_split(
                tf.nn.embedding_lookup(content, entities, name='ent_content'),
                delimiter=' ')
            content_len = tf.nn.embedding_lookup(content_len, entities, name='ent_content_len')
            ent_content_dense = tf.sparse_tensor_to_dense(ent_content,
                                                          default_value=str_pad,
                                                          name='ent_content_dense')
            ent_embedding = tf.nn.embedding_lookup(
                word_embedding,
                vocab_table.lookup(ent_content_dense, name='ent_content_ids'))

            return ent_embedding, content_len
Example #6
Source File: corruption.py From ConMask with MIT License

def get_true_tails(ent_rel_str, targets_lookup_table, targets, name=None):
    """Given ent \t rel pair return a list of string targets

    :param ent_rel_str:
    :param targets_lookup_table:
    :param name:
    :return:
    """
    with tf.name_scope(name, 'get_true_tails', [ent_rel_str, targets_lookup_table, targets]):
        target_entities_lookup_id = targets_lookup_table.lookup(ent_rel_str)

        # CHECK IF WE HAVE -1 HERE, if so the error will be have a -2 that is out of the range
        target_entities_lookup_id = tf.where(tf.equal(target_entities_lookup_id, -1),
                                             target_entities_lookup_id - 1,
                                             target_entities_lookup_id)

        # sparseTensor
        str_targets = tf.string_split(
            tf.nn.embedding_lookup(targets, target_entities_lookup_id), delimiter=' ')

        return str_targets.values
Example #7
Source File: split_tokens_decoder.py From natural-language-summary-generation-from-structured-data with MIT License

def decode(self, data, items):
    decoded_items = {}

    # Split tokens
    tokens = tf.string_split([data], delimiter=self.delimiter).values

    # Optionally prepend a special token
    if self.prepend_token is not None:
        tokens = tf.concat([[self.prepend_token], tokens], 0)

    # Optionally append a special token
    if self.append_token is not None:
        tokens = tf.concat([tokens, [self.append_token]], 0)

    decoded_items[self.length_feature_name] = tf.size(tokens)
    decoded_items[self.tokens_feature_name] = tokens
    return [decoded_items[_] for _ in items]
Example #8
Source File: data_util.py From reading_comprehension_tf with Apache License 2.0

def generate_word_feat(sentence, word_vocab_index, word_max_length,
                       word_pad, word_sos, word_eos, word_placeholder_enable):
    """generate word feature for sentence"""
    words = tf.string_split([sentence], delimiter=' ').values
    if word_placeholder_enable == True:
        words = tf.concat([[word_sos], words[:word_max_length], [word_eos],
                           tf.constant(word_pad, shape=[word_max_length])], axis=0)
        word_max_length = word_max_length + 2
    else:
        words = tf.concat([words[:word_max_length],
                           tf.constant(word_pad, shape=[word_max_length])], axis=0)
    words = tf.reshape(words[:word_max_length], shape=[word_max_length])
    words = tf.cast(word_vocab_index.lookup(words), dtype=tf.int32)
    words = tf.expand_dims(words, axis=-1)
    return words
Example #9
Source File: data_util.py From reading_comprehension_tf with Apache License 2.0

def create_trg_dataset(input_dataset, input_data_type, word_vocab_index,
                       word_max_length, word_pad, word_sos, word_eos,
                       word_placeholder_enable, num_parallel):
    """create dataset for input target data"""
    dataset = input_dataset

    if input_data_type == "span":
        dataset = dataset.map(lambda span: tf.string_split([span], delimiter='|').values,
                              num_parallel_calls=num_parallel)
        dataset = dataset.map(lambda span: tf.string_to_number(span, out_type=tf.int32),
                              num_parallel_calls=num_parallel)
        dataset = dataset.map(lambda span: tf.expand_dims(span, axis=-1),
                              num_parallel_calls=num_parallel)
    elif input_data_type == "text":
        dataset = dataset.map(lambda sent: generate_word_feat(sent,
                                                              word_vocab_index,
                                                              word_max_length,
                                                              word_pad,
                                                              word_sos,
                                                              word_eos,
                                                              word_placeholder_enable),
                              num_parallel_calls=num_parallel)

    return dataset
Example #10
Source File: word2vec.py From tensorflow_nlp with Apache License 2.0

def read_word_freq(filename):
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.WholeFileReader()
    key, value = reader.read(filename_queue)
    lines = tf.string_split([value], "\n")

    with tf.Session() as sess:
        # Start populating the filename queue.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        sess.run([lines])
        lines_eval = lines.eval()

        result = []
        for line in lines_eval.values:
            s = line.split()
            result.append((s[0], int(s[1])))

        coord.request_stop()
        coord.join(threads)

    return result
Example #11
Source File: utils.py From conv-ensemble-str with Apache License 2.0

def get_label(self, text, null_character=u'\u2591'):
    """ Returns the ids of the corresponding text.

    Args:
      text: a tensor with shape [batch_size, lexicon_size] and type string
      null_character: a unicode character used to replace '<null>' character.
        the default value is a light shade block '░'.
    """
    batch_size = text.shape[0].value
    lexicon_size = text.shape[1].value
    text = tf.reshape(text, [-1])
    sp_text = tf.string_split(text, delimiter='')
    sp_text = tf.sparse_reset_shape(sp_text,
                                    [batch_size * lexicon_size, self.max_sequence_length])
    sp_text = tf.sparse_tensor_to_dense(sp_text, default_value=null_character)
    ids = self.invert_table.lookup(sp_text)
    ids = tf.reshape(ids, [batch_size, lexicon_size, self.max_sequence_length])
    return tf.to_int32(ids)
Example #12
Source File: string_split_op_test.py From deep_image_model with Apache License 2.0

def testStringSplitWithDelimiterTensor(self):
    strings = ["hello|world", "hello world"]

    with self.test_session() as sess:
        delimiter = tf.placeholder(tf.string)

        tokens = tf.string_split(strings, delimiter=delimiter)

        with self.assertRaises(tf.errors.InvalidArgumentError):
            sess.run(tokens, feed_dict={delimiter: ["a", "b"]})
        with self.assertRaises(tf.errors.InvalidArgumentError):
            sess.run(tokens, feed_dict={delimiter: ["a"]})
        with self.assertRaises(tf.errors.InvalidArgumentError):
            sess.run(tokens, feed_dict={delimiter: "abc"})
        indices, values, shape = sess.run(tokens, feed_dict={delimiter: "|"})

        self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]])
        self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
        self.assertAllEqual(shape, [2, 2])
Example #13
Source File: string_split_op_test.py From deep_image_model with Apache License 2.0

def testStringSplitWithDelimiter(self):
    strings = ["hello|world", "hello world"]

    with self.test_session() as sess:
        self.assertRaises(
            ValueError, tf.string_split, strings, delimiter="delimiter")
        self.assertRaises(
            ValueError, tf.string_split, strings, delimiter=["|", ""])
        self.assertRaises(ValueError, tf.string_split, strings, delimiter=["a"])

        tokens = tf.string_split(strings, delimiter="|")
        indices, values, shape = sess.run(tokens)
        self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]])
        self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
        self.assertAllEqual(shape, [2, 2])
Example #14
Source File: dataset.py From Document-Transformer with BSD 3-Clause "New" or "Revised" License

def get_inference_input_ctx(inputs, ctxs, params):
    with tf.device("/cpu:0"):
        dataset = tf.data.Dataset.from_tensor_slices(
            tf.constant(inputs)
        )

        # Split string
        dataset = dataset.map(lambda x: tf.string_split([x]).values,
                              num_parallel_calls=params.num_threads)

        # Append <eos>
        dataset = dataset.map(
            lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
            num_parallel_calls=params.num_threads
        )

        # Convert tuple to dictionary
        dataset = dataset.map(
            lambda x: {"source": x, "source_length": tf.shape(x)[0]},
            num_parallel_calls=params.num_threads
        )

        dataset = dataset.padded_batch(
            params.decode_batch_size * len(params.device_list),
            {"source": [tf.Dimension(None)], "source_length": []},
            {"source": params.pad, "source_length": 0}
        )

        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()

        src_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["source"]),
            default_value=params.mapping["source"][params.unk]
        )
        features["source"] = src_table.lookup(features["source"])

        return features
Example #15
Source File: dataset.py From Document-Transformer with BSD 3-Clause "New" or "Revised" License

def get_inference_input(inputs, params):
    with tf.device("/cpu:0"):
        dataset = tf.data.Dataset.from_tensor_slices(
            tf.constant(inputs)
        )

        # Split string
        dataset = dataset.map(lambda x: tf.string_split([x]).values,
                              num_parallel_calls=params.num_threads)

        # Append <eos>
        dataset = dataset.map(
            lambda x: tf.concat([x, [tf.constant(params.eos)]], axis=0),
            num_parallel_calls=params.num_threads
        )

        # Convert tuple to dictionary
        dataset = dataset.map(
            lambda x: {"source": x, "source_length": tf.shape(x)[0]},
            num_parallel_calls=params.num_threads
        )

        dataset = dataset.padded_batch(
            params.decode_batch_size * len(params.device_list),
            {"source": [tf.Dimension(None)], "source_length": []},
            {"source": params.pad, "source_length": 0}
        )

        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()

        src_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["source"]),
            default_value=params.mapping["source"][params.unk]
        )
        features["source"] = src_table.lookup(features["source"])

        return features
Example #16
Source File: data_util.py From reading_comprehension_tf with Apache License 2.0

def generate_subword_feat(sentence, subword_vocab_index, word_max_length, subword_max_length,
                          subword_size, word_sos, word_eos, word_placeholder_enable, subword_pad):
    def word_to_subword(word):
        """generate subwords for word"""
        word_len = tf.size(tf.string_split([word], delimiter=''))
        subwords = tf.substr([word], 0, subword_size)
        for i in range(1, subword_max_length):
            subwords = tf.cond(i + subword_size - 1 < word_len,
                               lambda: tf.concat([subwords, tf.substr([word], i, subword_size)], 0),
                               lambda: subwords)

        subwords = tf.concat([subwords[:subword_max_length],
                              tf.constant(subword_pad, shape=[subword_max_length])], axis=0)
        subwords = tf.reshape(subwords[:subword_max_length], shape=[subword_max_length])

        return subwords

    """generate subword feature for sentence"""
    words = tf.string_split([sentence], delimiter=' ').values
    if word_placeholder_enable == True:
        words = tf.concat([[word_sos], words[:word_max_length], [word_eos],
                           tf.constant(subword_pad, shape=[word_max_length])], axis=0)
        word_max_length = word_max_length + 2
    else:
        words = tf.concat([words[:word_max_length],
                           tf.constant(subword_pad, shape=[word_max_length])], axis=0)

    words = tf.reshape(words[:word_max_length], shape=[word_max_length])
    word_subwords = tf.map_fn(word_to_subword, words)
    word_subwords = tf.cast(subword_vocab_index.lookup(word_subwords), dtype=tf.int32)

    return word_subwords
Example #17
Source File: dataset_utils.py From TwinGAN with Apache License 2.0

def tensors_to_item(self, keys_to_tensors):
    unmapped_tensor = super(OneHotLabelTensor, self).tensors_to_item(keys_to_tensors)
    labels_text_split = tf.string_split([unmapped_tensor], delimiter=self._delimiter)
    tensor = self._table.lookup(labels_text_split.values)
    tensor = util_misc.safe_one_hot_encoding(tensor, self._num_classes, dtype=self._dtype)
    return tensor

#####################
# tf example parser #
#####################
# tf example parser functions. Some are taken from the tensorflow object detection repo.
Example #18
Source File: data_util.py From reading_comprehension_tf with Apache License 2.0

def generate_char_feat(sentence, char_vocab_index, word_max_length, char_max_length,
                       word_sos, word_eos, word_placeholder_enable, char_pad):
    def word_to_char(word):
        """generate chars for word"""
        chars = tf.string_split([word], delimiter='').values
        chars = tf.concat([chars[:char_max_length],
                           tf.constant(char_pad, shape=[char_max_length])], axis=0)
        chars = tf.reshape(chars[:char_max_length], shape=[char_max_length])

        return chars

    """generate char feature for sentence"""
    words = tf.string_split([sentence], delimiter=' ').values
    if word_placeholder_enable == True:
        words = tf.concat([[word_sos], words[:word_max_length], [word_eos],
                           tf.constant(char_pad, shape=[word_max_length])], axis=0)
        word_max_length = word_max_length + 2
    else:
        words = tf.concat([words[:word_max_length],
                           tf.constant(char_pad, shape=[word_max_length])], axis=0)

    words = tf.reshape(words[:word_max_length], shape=[word_max_length])
    word_chars = tf.map_fn(word_to_char, words)
    word_chars = tf.cast(char_vocab_index.lookup(word_chars), dtype=tf.int32)

    return word_chars
Example #19
Source File: tf_example_decoder.py From aster with MIT License

def _split_lexicon(self, keys_to_tensors):
    joined_lexicon = keys_to_tensors[fields.TfExampleFields.lexicon]
    lexicon_sparse = tf.string_split([joined_lexicon], delimiter='\t')
    lexicon = tf.sparse_tensor_to_dense(lexicon_sparse, default_value='')[0]
    return lexicon
Example #20
Source File: tokenizeddata.py From ChatLearner with Apache License 2.0

def get_inference_batch(self, src_dataset):
    text_dataset = src_dataset.map(lambda src: tf.string_split([src]).values)

    if self.hparams.src_max_len_infer:
        text_dataset = text_dataset.map(lambda src: src[:self.hparams.src_max_len_infer])

    # Convert the word strings to ids
    id_dataset = text_dataset.map(lambda src: tf.cast(self.vocab_table.lookup(src), tf.int32))
    if self.hparams.source_reverse:
        id_dataset = id_dataset.map(lambda src: tf.reverse(src, axis=[0]))

    # Add in the word counts.
    id_dataset = id_dataset.map(lambda src: (src, tf.size(src)))

    def batching_func(x):
        return x.padded_batch(
            self.hparams.batch_size_infer,
            # The entry is the source line rows; this has unknown-length vectors.
            # The last entry is the source row size; this is a scalar.
            padded_shapes=(tf.TensorShape([None]),  # src
                           tf.TensorShape([])),     # src_len
            # Pad the source sequences with eos tokens. Though notice we don't generally need to
            # do this since later on we will be masking out calculations past the true sequence.
            padding_values=(self.hparams.eos_id,  # src
                            0))                   # src_len -- unused

    id_dataset = batching_func(id_dataset)

    infer_iter = id_dataset.make_initializable_iterator()
    (src_ids, src_seq_len) = infer_iter.get_next()

    return BatchedInput(initializer=infer_iter.initializer,
                        source=src_ids,
                        target_input=None,
                        target_output=None,
                        source_sequence_length=src_seq_len,
                        target_sequence_length=None)
Example #21
Source File: logistic_regression.py From tf-encrypted with Apache License 2.0

def provide_data(self):
    def decode(line):
        fields = tf.string_split([line], self.field_delim).values
        if self.index:  # Skip index
            fields = fields[1:]
        fields = tf.regex_replace(fields, "|".join(self.na_values), "nan")
        fields = tf.string_to_number(fields, tf.float32)
        return fields

    def fill_na(fields, fill_values):
        fields = tf.where(tf.is_nan(fields), fill_values, fields)
        return fields

    dataset = tf.data.TextLineDataset(self.local_data_file)
    if self.header:  # Skip header
        dataset = dataset.skip(1)
    dataset = (
        dataset.map(decode)
        .map(lambda x: fill_na(x, self.data_schema.field_defaults))
        .repeat()
        .batch(self.batch_size)
    )
    iterator = dataset.make_one_shot_iterator()
    batch = iterator.get_next()
    batch = tf.reshape(batch, [self.batch_size, self.data_schema.field_num])
    return batch
Example #22
Source File: aby3.py From tf-encrypted with Apache License 2.0

def _read_(prot, filename_prefix, batch_size, n_columns):
    row_shape = [n_columns]

    def decode(line):
        fields = tf.string_split([line], ",").values
        fields = tf.strings.to_number(fields, tf.int64)
        fields = tf.reshape(fields, row_shape)
        return fields

    batch = [[None] * 2 for _ in range(3)]
    for i in range(3):
        with tf.device(prot.servers[i].device_name):
            for j in range(2):
                data = (
                    tf.data.TFRecordDataset(
                        ["{}_share{}{}".format(filename_prefix, i, j)]
                    )
                    .map(decode)
                    .repeat()
                    .batch(batch_size=batch_size)
                )
                it = data.make_one_shot_iterator()
                batch[i][j] = it.get_next()
                batch[i][j] = tf.reshape(batch[i][j], [batch_size] + row_shape)
                batch[i][j] = prot.int_factory.tensor(batch[i][j])

    return ABY3PrivateTensor(prot, batch, True, ARITHMETIC)
Example #23
Source File: data.py From tf_examples with Apache License 2.0

def make_input_fn(mode, filename_in, filename_out, in_vocab_file, out_vocab_file,
                  batch_size, vocab_size, input_max_length, output_max_length,
                  queue_capacity=10000, num_threads=10):
    def input_fn():
        num_epochs = None if mode == tf.estimator.ModeKeys.TRAIN else 1
        filename_in_queue = tf.train.string_input_producer(
            [filename_in], num_epochs=num_epochs)
        filename_out_queue = tf.train.string_input_producer(
            [filename_out], num_epochs=num_epochs)
        reader_in = tf.TextLineReader()
        reader_out = tf.TextLineReader()
        in_list, out_list = [], []
        for _ in range(num_threads):
            in_list.append(reader_in.read(filename_in_queue)[1])
            out_list.append(reader_out.read(filename_out_queue)[1])
        tensor_in = reader_in.read(filename_in_queue)[1]
        tensor_out = reader_out.read(filename_out_queue)[1]
        if mode == tf.estimator.ModeKeys.TRAIN:
            inputs, outputs = tf.train.shuffle_batch(
                (tensor_in, tensor_out), batch_size, capacity=queue_capacity,
                min_after_dequeue=batch_size * 3, enqueue_many=True)
        else:
            inputs, outputs = tf.train.batch(
                (tensor_in, tensor_out), batch_size, capacity=queue_capacity,
                allow_smaller_final_batch=True)

        # Preprocess inputs.
        inputs = utils.sparse_to_dense_trim(
            tf.string_split(inputs),
            output_shape=[batch_size, input_max_length],
            default_value='<\S>')
        outputs = utils.sparse_to_dense_trim(
            tf.string_split(outputs),
            output_shape=[batch_size, output_max_length],
            default_value='<\S>')
        tf.identity(inputs[0], name='inputs')
        tf.identity(outputs[0], name='outputs')

        in_vocab = tf.contrib.lookup.index_table_from_file(
            in_vocab_file, vocab_size=vocab_size, default_value=2)
        input_ids = in_vocab.lookup(inputs)
        out_vocab = tf.contrib.lookup.index_table_from_file(
            out_vocab_file, vocab_size=vocab_size, default_value=2)
        output_ids = out_vocab.lookup(outputs)
        return {'inputs': input_ids, 'outputs': output_ids}, None
    return input_fn
Example #24
Source File: main.py From NAO with GNU General Public License v3.0

def predict_input_fn(predict_from_file):
    dataset = tf.data.TextLineDataset(predict_from_file)

    def decode_record(record):
        src = tf.string_split([record]).values
        src = tf.string_to_number(src, out_type=tf.int32)
        return src, tf.constant([SOS], dtype=tf.int32)

    dataset = dataset.map(decode_record)
    dataset = dataset.batch(FLAGS.batch_size)
    iterator = dataset.make_one_shot_iterator()
    inputs, targets_inputs = iterator.get_next()
    assert inputs.shape.ndims == 2
    return inputs, targets_inputs
Example #25
Source File: 2_adanet_avazu.py From deep-learning-note with MIT License

def generator(ln):
    splits = tf.string_split([ln], delimiter=',')
    label = splits.values[0]
    # Parse the dense features
    features = {}
    for i in range(1, 14):
        features['I' + str(i)] = tf.string_to_number(splits.values[i], tf.int64)
    return features, label
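A short sketch of how a line-parsing function like generator is typically wired into tf.data (hypothetical; the file name is a placeholder, not from the original project):

# Hypothetical usage of `generator` above (TF 1.x).
# Each CSV line is assumed to hold a label followed by 13 integer columns.
dataset = tf.data.TextLineDataset("train.csv")  # placeholder path
dataset = dataset.map(generator).batch(32)
features, labels = dataset.make_one_shot_iterator().get_next()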
Example #26
Source File: decoder_main.py From NAO with GNU General Public License v3.0

def predict_from_file(estimator, batch_size, decode_from_file, decode_to_file=None):

    def infer_input_fn():
        sos_id = tf.constant([SOS], dtype=tf.int32)
        dataset = tf.data.TextLineDataset(decode_from_file)

        def decode_record(record):
            src = tf.string_split([record]).values
            src = tf.string_to_number(src, out_type=tf.float32)
            return src, tf.constant([SOS], dtype=tf.int32)

        dataset = dataset.map(decode_record)
        dataset = dataset.batch(FLAGS.batch_size)
        iterator = dataset.make_one_shot_iterator()
        inputs, targets_inputs = iterator.get_next()
        assert inputs.shape.ndims == 2
        # assert targets_inputs.shape.ndims == 2
        return {
            'inputs': inputs,
            'targets_inputs': targets_inputs,
            'targets': None,
        }, None

    results = []
    result_iter = estimator.predict(infer_input_fn)
    for result in result_iter:
        output = result['output'].flatten()
        output = ' '.join(map(str, output))
        tf.logging.info('Inference results OUTPUT: %s' % output)
        results.append(output)

    if decode_to_file:
        output_filename = decode_to_file
    else:
        output_filename = '%s.result' % decode_from_file

    tf.logging.info('Writing results into {0}'.format(output_filename))
    with tf.gfile.Open(output_filename, 'w') as f:
        for res in results:
            f.write('%s\n' % (res))
Example #27
Source File: string_split_op_test.py From deep_image_model with Apache License 2.0

def testStringSplitEmptyToken(self):
    strings = [" hello ", "", "world "]

    with self.test_session() as sess:
        tokens = tf.string_split(strings)
        indices, values, shape = sess.run(tokens)
        self.assertAllEqual(indices, [[0, 0], [2, 0]])
        self.assertAllEqual(values, [b"hello", b"world"])
        self.assertAllEqual(shape, [3, 1])
Example #28
Source File: string_split_op_test.py From deep_image_model with Apache License 2.0

def testStringSplitEmptyDelimiter(self):
    strings = ["hello", "hola", b"\xF0\x9F\x98\x8E"]  # Last string is U+1F60E

    with self.test_session() as sess:
        tokens = tf.string_split(strings, delimiter="")
        indices, values, shape = sess.run(tokens)
        self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                                      [1, 0], [1, 1], [1, 2], [1, 3],
                                      [2, 0], [2, 1], [2, 2], [2, 3]])
        expected = np.array(
            ['h', 'e', 'l', 'l', 'o', 'h', 'o', 'l', 'a', b'\xf0', b'\x9f',
             b'\x98', b'\x8e'],
            dtype='|S1')
        self.assertAllEqual(values.tolist(), expected)
        self.assertAllEqual(shape, [3, 5])
Example #29
Source File: string_split_op_test.py From deep_image_model with Apache License 2.0

def testStringSplit(self):
    strings = ["pigs on the wing", "animals"]

    with self.test_session() as sess:
        tokens = tf.string_split(strings)
        indices, values, shape = sess.run(tokens)
        self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
        self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
        self.assertAllEqual(shape, [2, 4])
Example #30
Source File: preprocessors.py From mead-baseline with Apache License 2.0

def lowercase(self, raw_post):
    split_chars = tf.string_split(tf.reshape(raw_post, [-1]), delimiter="").values
    upchar_inds = self.upchars_lut.lookup(split_chars)
    return tf.reduce_join(tf.map_fn(lambda x: tf.cond(x[0] > 25,
                                                      lambda: x[1],
                                                      lambda: self.lchars[x[0]]),
                                    (upchar_inds, split_chars),
                                    dtype=tf.string))