Python data_utils.EOS_ID Examples
The following are 30
code examples of data_utils.EOS_ID().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
data_utils
, or try the search function
.
Example #1
Source File: baseline.py From ecm with Apache License 2.0 | 6 votes |
def read_data(path, max_size=None): data_set = [[] for _ in _buckets] data = json.load(open(path,'r')) counter = 0 size_max = 0 for pair in data: post = pair[0] responses = pair[1] source_ids = [int(x) for x in post[0]] for response in responses: if not max_size or counter < max_size: counter += 1 if counter % 100000 == 0: print(" reading data pair %d" % counter) sys.stdout.flush() target_ids = [int(x) for x in response[0]] target_ids.append(data_utils.EOS_ID) #size_max = len(source_ids) if len(source_ids) > size_max else size_max #size_max = len(target_ids) if len(target_ids) > size_max else size_max for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids, int(post[1]), int(response[1])]) break return data_set
Example #2
Source File: translate.py From transliteration with Apache License 2.0 | 6 votes |
def run(self, sentence): # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, self.en_vocab) # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = self.model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = self.model.step(self.sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. return "".join([self.rev_fr_vocab[output] for output in outputs])
Example #3
Source File: translate.py From DeepAffinity with GNU General Public License v3.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] mycount=0 with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) mycount=mycount+1 break source, target = source_file.readline(), target_file.readline() print(mycount) return data_set
Example #4
Source File: translate.py From object_detection_kitti with Apache License 2.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = source_file.readline(), target_file.readline() return data_set
Example #5
Source File: translate.py From hands-detection with MIT License | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = source_file.readline(), target_file.readline() return data_set
Example #6
Source File: translate.py From ChatBotCourse with MIT License | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 10 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = source_file.readline(), target_file.readline() return data_set
Example #7
Source File: translate.py From transliteration with Apache License 2.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] diffs = [] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) diffs.append(source_size - len(source_ids) + target_size - len(target_ids)) break source, target = source_file.readline(), target_file.readline() #print("mean padding count: %f" % np.mean(diffs)) return data_set
Example #8
Source File: translate.py From DeepAffinity with GNU General Public License v3.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] mycount=0 with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) mycount=mycount+1 break source, target = source_file.readline(), target_file.readline() print(mycount) return data_set
Example #9
Source File: translate.py From DeepAffinity with GNU General Public License v3.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] mycount=0 with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) mycount=mycount+1 break source, target = source_file.readline(), target_file.readline() print(mycount) return data_set
Example #10
Source File: translate.py From DeepAffinity with GNU General Public License v3.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] mycount=0 with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) mycount=mycount+1 break source, target = source_file.readline(), target_file.readline() print(mycount) return data_set
Example #11
Source File: translate.py From DeepAffinity with GNU General Public License v3.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] mycount=0 with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) mycount=mycount+1 break source, target = source_file.readline(), target_file.readline() print(mycount) return data_set
Example #12
Source File: translate.py From DeepAffinity with GNU General Public License v3.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] mycount=0 with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) mycount=mycount+1 break source, target = source_file.readline(), target_file.readline() print(mycount) return data_set
Example #13
Source File: translate.py From DeepAffinity with GNU General Public License v3.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] mycount=0 with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) mycount=mycount+1 break source, target = source_file.readline(), target_file.readline() print(mycount) return data_set
Example #14
Source File: translate.py From DeepAffinity with GNU General Public License v3.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] mycount=0 with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) mycount=mycount+1 break source, target = source_file.readline(), target_file.readline() print(mycount) return data_set
Example #15
Source File: translate.py From DeepAffinity with GNU General Public License v3.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] mycount=0 with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) mycount=mycount+1 break source, target = source_file.readline(), target_file.readline() print(mycount) return data_set
Example #16
Source File: translate.py From object_detection_with_tensorflow with MIT License | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = source_file.readline(), target_file.readline() return data_set
Example #17
Source File: transliterate.py From deep-trans with MIT License | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 10000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = source_file.readline(), target_file.readline() return data_set
Example #18
Source File: execute.py From deep-news-summarization with MIT License | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 1000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = source_file.readline(), target_file.readline() return data_set
Example #19
Source File: translate.py From HumanRecognition with MIT License | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = source_file.readline(), target_file.readline() return data_set
Example #20
Source File: translate.py From yolo_v2 with Apache License 2.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = source_file.readline(), target_file.readline() return data_set
Example #21
Source File: translate.py From DOTA_models with Apache License 2.0 | 5 votes |
def read_data(source_path, target_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the source language. target_path: path to the file with token-ids for the target language; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target) pairs read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1]; source and target are lists of token-ids. """ data_set = [[] for _ in _buckets] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: source, target = source_file.readline(), target_file.readline() counter = 0 while source and target and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = source_file.readline(), target_file.readline() return data_set
Example #22
Source File: translate.py From HumanRecognition with MIT License | 4 votes |
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.from" % FLAGS.from_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.to" % FLAGS.to_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
Example #23
Source File: translate.py From object_detection_with_tensorflow with MIT License | 4 votes |
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.from" % FLAGS.from_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.to" % FLAGS.to_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
Example #24
Source File: run_multi-task_rnn.py From pynlp with MIT License | 4 votes |
def read_data(source_path, target_path, label_path, max_size=None): """Read data from source and target files and put into buckets. Args: source_path: path to the files with token-ids for the word sequence. target_path: path to the file with token-ids for the tag sequence; it must be aligned with the source file: n-th line contains the desired output for n-th line from the source_path. label_path: path to the file with token-ids for the intent label max_size: maximum number of lines to read, all other will be ignored; if 0 or None, data files will be read completely (no limit). Returns: data_set: a list of length len(_buckets); data_set[n] contains a list of (source, target, label) tuple read from the provided data files that fit into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and len(target) < _buckets[n][1];source, target, label are lists of token-ids """ data_set = [[] for _ in _buckets] with tf.gfile.GFile(source_path, mode="r") as source_file: with tf.gfile.GFile(target_path, mode="r") as target_file: with tf.gfile.GFile(label_path, mode="r") as label_file: source = source_file.readline() target = target_file.readline() label = label_file.readline() counter = 0 while source and target and label and (not max_size or counter < max_size): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] label_ids = [int(x) for x in label.split()] # target_ids.append(data_utils.EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len(target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids, label_ids]) break source = source_file.readline() target = target_file.readline() label = label_file.readline() return data_set # 3 outputs in each unit: source_ids, target_ids, label_ids
Example #25
Source File: parser.py From DeepDeepParser with Apache License 2.0 | 4 votes |
def read_mrs_data(buckets, source_paths, target_paths, max_size=None, any_length=False, offset_target=-1): # Read in all files seperately. source_inputs = [data_utils.read_ids_file(path, max_size) for path in source_paths] target_inputs = [data_utils.read_ids_file(path, max_size) for path in target_paths] data_set = [[] for _ in buckets] data_list = [] # Assume everything is well-aligned. for i in xrange(len(source_inputs[0])): # over examples # List of sequences of each type. source_ids = [source_input[i] for source_input in source_inputs] # Assume first target type predicts EOS. # Not checking pointer ranges: do that inside tf graph. target_ids = [target_inputs[0][i] + [data_utils.EOS_ID]] for j, target_input in enumerate(target_inputs[1:]): if offset_target > 0 and j + 1 == offset_target: target_ids.append([data_utils.PAD_ID] + target_input[i] + [data_utils.PAD_ID]) else: target_ids.append(target_input[i] + [data_utils.PAD_ID]) found_bucket = False for bucket_id, (source_size, target_size) in enumerate(buckets): if len(source_ids[0]) < source_size and len(target_ids[0]) < target_size: data_set[bucket_id].append([source_ids, target_ids]) data_list.append([source_ids, target_ids, bucket_id]) found_bucket = True break if any_length and not found_bucket: # Crop examples that are larger than the largest bucket. source_size, target_size = buckets[-1][0], buckets[-1][1] if len(source_ids[0]) >= source_size: source_ids = [source_id[:source_size] for source_id in source_ids] if len(target_ids[0]) >= target_size: target_ids = [target_id[:target_size] for target_id in target_ids] bucket_id = len(buckets) - 1 data_set[bucket_id].append([source_ids, target_ids]) data_list.append([source_ids, target_ids, bucket_id]) return data_set, data_list
Example #26
Source File: translate.py From object_detection_kitti with Apache License 2.0 | 4 votes |
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.from" % FLAGS.from_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.to" % FLAGS.to_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
Example #27
Source File: translate.py From hands-detection with MIT License | 4 votes |
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.from" % FLAGS.from_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.to" % FLAGS.to_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
Example #28
Source File: translate.py From ChatBotCourse with MIT License | 4 votes |
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.input" % FLAGS.input_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.output" % FLAGS.output_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
Example #29
Source File: transliterate.py From deep-trans with MIT License | 4 votes |
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one word at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size) hn_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.hn" % FLAGS.hn_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_hn_vocab = data_utils.initialize_vocabulary(hn_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() word = sys.stdin.readline() while word: word = word.lower() char_list_new = list(word) word = " ".join(char_list_new) # Get token-ids for the input word. token_ids = data_utils.word_to_token_ids(tf.compat.as_bytes(word), en_vocab) # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the word to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the word. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out Hindi word corresponding to outputs. print("".join([tf.compat.as_str(rev_hn_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() word = sys.stdin.readline()
Example #30
Source File: execute.py From deep-news-summarization with MIT License | 4 votes |
def decode_input(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_enc.txt" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_dec.txt" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab) # Which bucket does it belong to? And place the sentence to the last bucket if its token length is larger then the bucket length. bucket_id = min([b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)] + [len(_buckets)-1]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()