Python tokenization.convert_to_unicode() Examples
The following are 30 code examples of tokenization.convert_to_unicode().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module tokenization, or try the search function.
Example #1
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    Rows come from a TSV: text pair in columns 3 and 4, gold label in
    column 0. The test split carries no label, so a dummy "0" is used.
    """
    examples = []
    for idx, row in enumerate(lines):
        if idx == 0:
            continue  # skip the header row
        guid = "%s-%s" % (set_type, idx)
        text_a = tokenization.convert_to_unicode(row[3])
        text_b = tokenization.convert_to_unicode(row[4])
        label = ("0" if set_type == "test"
                 else tokenization.convert_to_unicode(row[0]))
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
Example #2
Source File: sentence_embedding.py From SOQAL with MIT License | 6 votes |
def read_examples(text):
    """Return a single-element list of `InputExample`s built from a string.

    Line breaks are flattened to spaces. If the string contains the
    " ||| " separator it is split into a (text_a, text_b) pair; otherwise
    the whole string becomes text_a and text_b is None.
    """
    flattened = text.replace('\n', ' ')  # remove line breaks
    line = tokenization.convert_to_unicode(flattened).strip()
    pair = re.match(r"^(.*) \|\|\| (.*)$", line)
    if pair is not None:
        text_a, text_b = pair.group(1), pair.group(2)
    else:
        text_a, text_b = line, None
    return [InputExample(unique_id=0, text_a=text_a, text_b=text_b)]
Example #3
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    Only the test split has a header row. Test rows carry the sentence in
    column 1 and no gold label (a dummy "0" is used); train/dev rows carry
    the sentence in column 3 and the label in column 1.
    """
    examples = []
    for idx, row in enumerate(lines):
        if set_type == "test" and idx == 0:
            continue  # skip the test-set header
        guid = "%s-%s" % (set_type, idx)
        if set_type == "test":
            text_a = tokenization.convert_to_unicode(row[1])
            label = "0"
        else:
            text_a = tokenization.convert_to_unicode(row[3])
            label = tokenization.convert_to_unicode(row[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
Example #4
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    The text pair lives in TSV columns 3 and 4; the gold label in column 0.
    The test split has no label, so a dummy "0" is substituted.
    """
    examples = []
    for idx, row in enumerate(lines):
        if idx == 0:
            continue  # header row
        guid = "%s-%s" % (set_type, idx)
        text_a = tokenization.convert_to_unicode(row[3])
        text_b = tokenization.convert_to_unicode(row[4])
        if set_type == "test":
            label = "0"
        else:
            label = tokenization.convert_to_unicode(row[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
Example #5
Source File: run_classifier.py From text_bert_cnn with MIT License | 6 votes |
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    Only the test split has a header. Test rows: sentence in column 1,
    dummy label "0". Train/dev rows: sentence in column 3, label in
    column 1.
    """
    examples = []
    for row_index, row in enumerate(lines):
        if set_type == "test" and row_index == 0:
            continue  # test-set header
        guid = "%s-%s" % (set_type, row_index)
        is_test = set_type == "test"
        text_a = tokenization.convert_to_unicode(row[1] if is_test else row[3])
        label = "0" if is_test else tokenization.convert_to_unicode(row[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
Example #6
Source File: extract_features.py From DeepCT with BSD 3-Clause "New" or "Revised" License | 6 votes |
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file.

    One example per non-empty line; a line containing " ||| " is split
    into a (text_a, text_b) pair, otherwise the whole line is text_a.
    """
    examples = []
    unique_id = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            raw = tokenization.convert_to_unicode(reader.readline())
            if not raw:
                break  # EOF
            stripped = raw.strip()
            pair = re.match(r"^(.*) \|\|\| (.*)$", stripped)
            if pair is None:
                text_a, text_b = stripped, None
            else:
                text_a, text_b = pair.group(1), pair.group(2)
            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
            unique_id += 1
    return examples
Example #7
Source File: run_deepct.py From DeepCT with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_train_examples(self, data_dir):
    """Reads training `InputExample`s from the four JSONL training folds.

    Each line is a JSON object with a "doc" entry (id/title) and a
    "term_recall" dict. Lines with an empty title or empty term-recall
    dict are skipped. The collected examples are shuffled before return.
    """
    examples = []
    train_files = ["train.fold0.docterm_recall", "train.fold1.docterm_recall",
                   "train.fold2.docterm_recall", "train.fold3.docterm_recall"]
    for file_name in train_files:
        # `with` guarantees the handle is closed even if json.loads raises
        # (the original open()/close() pair leaked it on exceptions).
        with open(os.path.join(data_dir, file_name)) as train_file:
            for line in train_file:
                json_dict = json.loads(line)
                docid = json_dict["doc"]["id"]
                doc_text = tokenization.convert_to_unicode(
                    json_dict["doc"]["title"])
                term_recall_dict = json_dict["term_recall"]
                if not term_recall_dict or not doc_text.strip():
                    continue  # nothing to learn from this document
                examples.append(
                    InputExample(guid="train-%s" % docid, text=doc_text,
                                 term_recall_dict=term_recall_dict))
    random.shuffle(examples)
    return examples
Example #8
Source File: run_deepct.py From DeepCT with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_dev_examples(self, data_dir):
    """Reads dev `InputExample`s from the first half of fold 4 (JSONL).

    Each line is a JSON object with a "doc" entry (id/title) and a
    "term_recall" dict; no filtering or shuffling is applied.
    """
    dev_files = ["train.fold4.docterm_recall.firsthalf"]
    examples = []
    for file_name in dev_files:
        # `with` guarantees the handle is closed even if json.loads raises
        # (the original open()/close() pair leaked it on exceptions).
        with open(os.path.join(data_dir, file_name)) as dev_file:
            for line in dev_file:
                json_dict = json.loads(line)
                docid = json_dict["doc"]["id"]
                doc_text = tokenization.convert_to_unicode(
                    json_dict["doc"]["title"])
                term_recall_dict = json_dict["term_recall"]
                examples.append(
                    InputExample(guid="dev-%s" % docid, text=doc_text,
                                 term_recall_dict=term_recall_dict))
    return examples
Example #9
Source File: run_deepct.py From DeepCT with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_test_examples(self, data_dir):
    """Reads test `InputExample`s from the second half of fold 4 (JSONL).

    Each line is a JSON object with a "doc" entry (id/title) and a
    "term_recall" dict; no filtering or shuffling is applied.
    """
    test_files = ["train.fold4.docterm_recall.secondhalf"]
    examples = []
    for file_name in test_files:
        # `with` guarantees the handle is closed even if json.loads raises
        # (the original open()/close() pair leaked it on exceptions).
        with open(os.path.join(data_dir, file_name)) as test_file:
            for line in test_file:
                json_dict = json.loads(line)
                docid = json_dict["doc"]["id"]
                doc_text = tokenization.convert_to_unicode(
                    json_dict["doc"]["title"])
                term_recall_dict = json_dict["term_recall"]
                examples.append(
                    InputExample(guid="test-%s" % docid, text=doc_text,
                                 term_recall_dict=term_recall_dict))
    return examples
Example #10
Source File: run_deepct.py From DeepCT with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_train_examples(self, data_dir):
    """Reads training `InputExample`s from a single JSONL file.

    Note: here `data_dir` is the path of the file itself, not a
    directory. Each line is a JSON object with a "doc" entry (id/title)
    and a "term_recall" dict. Examples are shuffled before return.
    """
    examples = []
    train_files = [data_dir]
    for file_name in train_files:
        # `with` guarantees the handle is closed even if json.loads raises
        # (the original open()/close() pair leaked it on exceptions).
        with open(file_name) as train_file:
            for line in train_file:
                json_dict = json.loads(line)
                docid = json_dict["doc"]["id"]
                doc_text = tokenization.convert_to_unicode(
                    json_dict["doc"]["title"])
                term_recall_dict = json_dict["term_recall"]
                examples.append(
                    InputExample(guid="train-%s" % docid, text=doc_text,
                                 term_recall_dict=term_recall_dict))
    random.shuffle(examples)
    return examples
Example #11
Source File: run_deepct.py From DeepCT with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_test_examples(self, data_dir):
    """Reads test `InputExample`s from a TREC-19 JSONL file.

    Note: here `data_dir` is the path of the file itself. The document
    text is taken from the field named by FLAGS.doc_field, truncated and
    cleaned to FLAGS.max_body_length. Empty documents are replaced with a
    "." placeholder so every docid still yields an example.
    """
    examples = []
    test_files = [data_dir]
    for file_name in test_files:
        # `with` guarantees the handle is closed even if json.loads raises
        # (the original open()/close() pair leaked it on exceptions).
        with open(file_name) as test_file:
            for line in test_file:
                jdict = json.loads(line)
                docid = jdict["id"]
                doc_text = tokenization.convert_to_unicode(
                    jdict[FLAGS.doc_field])
                doc_text = truncate_and_clean_trec_19_doc(
                    doc_text, FLAGS.max_body_length)
                if not doc_text.strip():
                    doc_text = '.'  # placeholder instead of skipping docid
                examples.append(
                    InputExample(guid="test-%s" % docid, text=doc_text,
                                 term_recall_dict={}))
    return examples
Example #12
Source File: run_deepct.py From DeepCT with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_test_examples(self, data_dir):
    """Reads test `InputExample`s from a JSONL file with "id"/"content".

    Note: here `data_dir` is the path of the file itself. Empty documents
    are replaced with a "." placeholder so every docid yields an example.
    """
    examples = []
    test_files = [data_dir]
    for file_name in test_files:
        # `with` guarantees the handle is closed even if json.loads raises
        # (the original open()/close() pair leaked it on exceptions).
        with open(file_name) as test_file:
            for line in test_file:
                jdict = json.loads(line)
                docid = jdict["id"]
                doc_text = tokenization.convert_to_unicode(jdict["content"])
                if not doc_text.strip():
                    doc_text = '.'  # placeholder for empty documents
                examples.append(
                    InputExample(guid="test-%s" % docid, text=doc_text,
                                 term_recall_dict={}))
    return examples
Example #13
Source File: run_classifier.py From text_bert_cnn with MIT License | 6 votes |
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    The guid embeds the row's own id (column 0); the sentence pair lives
    in columns 8 and 9 and the label in the last column. Test rows have
    no gold label, so "contradiction" is used as a placeholder.
    """
    examples = []
    for idx, row in enumerate(lines):
        if idx == 0:
            continue  # header row
        guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(row[0]))
        text_a = tokenization.convert_to_unicode(row[8])
        text_b = tokenization.convert_to_unicode(row[9])
        label = ("contradiction" if set_type == "test"
                 else tokenization.convert_to_unicode(row[-1]))
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
Example #14
Source File: extract_features.py From models with Apache License 2.0 | 6 votes |
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file.

    One example per non-empty line; a line containing " ||| " is split
    into a (text_a, text_b) pair, otherwise the whole line is text_a.
    """
    examples = []
    unique_id = 0
    with tf.io.gfile.GFile(input_file, "r") as reader:
        while True:
            raw = tokenization.convert_to_unicode(reader.readline())
            if not raw:
                break  # EOF
            stripped = raw.strip()
            pair = re.match(r"^(.*) \|\|\| (.*)$", stripped)
            if pair is None:
                text_a, text_b = stripped, None
            else:
                text_a, text_b = pair.group(1), pair.group(2)
            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
            unique_id += 1
    return examples
Example #15
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    Only the test split has a header row. Test rows carry the sentence in
    column 1 with a dummy "0" label; other splits use column 3 for text
    and column 1 for the label.
    """
    examples = []
    for idx, row in enumerate(lines):
        if set_type == "test" and idx == 0:
            continue  # test-set header
        guid = "%s-%s" % (set_type, idx)
        if set_type == "test":
            text_a = tokenization.convert_to_unicode(row[1])
            label = "0"
        else:
            text_a = tokenization.convert_to_unicode(row[3])
            label = tokenization.convert_to_unicode(row[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
Example #16
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def get_train_examples(self, data_dir):
    """See base class."""
    path = os.path.join(
        data_dir, "multinli", "multinli.train.%s.tsv" % self.language)
    examples = []
    for idx, row in enumerate(self._read_tsv(path)):
        if idx == 0:
            continue  # header row
        text_a = tokenization.convert_to_unicode(row[0])
        text_b = tokenization.convert_to_unicode(row[1])
        label = tokenization.convert_to_unicode(row[2])
        # Normalize the translated data's "contradictory" label.
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(guid="train-%d" % idx, text_a=text_a,
                         text_b=text_b, label=label))
    return examples
Example #17
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def get_dev_examples(self, data_dir):
    """See base class."""
    rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    examples = []
    for idx, row in enumerate(rows):
        if idx == 0:
            continue  # header row
        # Keep only rows matching this processor's target language.
        row_language = tokenization.convert_to_unicode(row[0])
        if row_language != tokenization.convert_to_unicode(self.language):
            continue
        examples.append(
            InputExample(
                guid="dev-%d" % idx,
                text_a=tokenization.convert_to_unicode(row[6]),
                text_b=tokenization.convert_to_unicode(row[7]),
                label=tokenization.convert_to_unicode(row[1])))
    return examples
Example #18
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def get_train_examples(self, data_dir):
    """See base class."""
    tsv_path = os.path.join(
        data_dir, "multinli", "multinli.train.%s.tsv" % self.language)
    examples = []
    for row_index, row in enumerate(self._read_tsv(tsv_path)):
        if row_index == 0:
            continue  # skip header
        label = tokenization.convert_to_unicode(row[2])
        # The translated data spells this label "contradictory".
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(
                guid="train-%d" % row_index,
                text_a=tokenization.convert_to_unicode(row[0]),
                text_b=tokenization.convert_to_unicode(row[1]),
                label=label))
    return examples
Example #19
Source File: extract_features.py From models with Apache License 2.0 | 6 votes |
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file.

    Each non-empty line yields one example; " ||| " splits a line into a
    sentence pair, otherwise the whole line is text_a.
    """
    examples = []
    next_id = 0
    with tf.io.gfile.GFile(input_file, "r") as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break  # reached EOF
            line = line.strip()
            match = re.match(r"^(.*) \|\|\| (.*)$", line)
            if match:
                text_a, text_b = match.group(1), match.group(2)
            else:
                text_a, text_b = line, None
            examples.append(
                InputExample(unique_id=next_id, text_a=text_a, text_b=text_b))
            next_id += 1
    return examples
Example #20
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    Only the test split has a header. Test rows: sentence in column 1,
    dummy "0" label. Train/dev rows: sentence in column 3, label in
    column 1.
    """
    examples = []
    is_test = set_type == "test"
    for idx, row in enumerate(lines):
        if is_test and idx == 0:
            continue  # test-set header
        guid = "%s-%s" % (set_type, idx)
        text_a = tokenization.convert_to_unicode(row[1] if is_test else row[3])
        label = "0" if is_test else tokenization.convert_to_unicode(row[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
Example #21
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    The guid embeds the row id from column 0; the sentence pair is in
    columns 8 and 9, the label in the last column. Test rows have no gold
    label, so "contradiction" is used as a placeholder.
    """
    examples = []
    for row_index, row in enumerate(lines):
        if row_index == 0:
            continue  # skip header
        row_id = tokenization.convert_to_unicode(row[0])
        if set_type == "test":
            label = "contradiction"
        else:
            label = tokenization.convert_to_unicode(row[-1])
        examples.append(
            InputExample(
                guid="%s-%s" % (set_type, row_id),
                text_a=tokenization.convert_to_unicode(row[8]),
                text_b=tokenization.convert_to_unicode(row[9]),
                label=label))
    return examples
Example #22
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def get_dev_examples(self, data_dir):
    """See base class."""
    examples = []
    dev_rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    target = tokenization.convert_to_unicode(self.language)
    for row_index, row in enumerate(dev_rows):
        if row_index == 0:
            continue  # skip header
        # Only keep rows in this processor's target language.
        if tokenization.convert_to_unicode(row[0]) != target:
            continue
        examples.append(
            InputExample(
                guid="dev-%d" % row_index,
                text_a=tokenization.convert_to_unicode(row[6]),
                text_b=tokenization.convert_to_unicode(row[7]),
                label=tokenization.convert_to_unicode(row[1])))
    return examples
Example #23
Source File: run_classifier.py From models with Apache License 2.0 | 6 votes |
def get_train_examples(self, data_dir):
    """See base class."""
    file_path = os.path.join(
        data_dir, "multinli", "multinli.train.%s.tsv" % self.language)
    examples = []
    for i, row in enumerate(self._read_tsv(file_path)):
        if i == 0:
            continue  # header
        label = tokenization.convert_to_unicode(row[2])
        # Map the translated data's "contradictory" onto "contradiction".
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(
                guid="train-%d" % i,
                text_a=tokenization.convert_to_unicode(row[0]),
                text_b=tokenization.convert_to_unicode(row[1]),
                label=label))
    return examples
Example #24
Source File: extract_features.py From models with Apache License 2.0 | 6 votes |
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file.

    Each non-empty line becomes one example; " ||| " splits a line into a
    (text_a, text_b) pair.
    """
    examples = []
    counter = 0
    with tf.io.gfile.GFile(input_file, "r") as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break  # end of file
            line = line.strip()
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a, text_b = line, None
            else:
                text_a, text_b = m.group(1), m.group(2)
            examples.append(
                InputExample(unique_id=counter, text_a=text_a, text_b=text_b))
            counter += 1
    return examples
Example #25
Source File: run_classifier.py From BERT with Apache License 2.0 | 6 votes |
def get_dev_examples(self, data_dir):
    """See base class."""
    examples = []
    lang = tokenization.convert_to_unicode(self.language)
    for i, row in enumerate(
            self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))):
        if i == 0:
            continue  # header
        # Skip rows whose language column doesn't match this processor.
        if tokenization.convert_to_unicode(row[0]) != lang:
            continue
        examples.append(
            InputExample(
                guid="dev-%d" % i,
                text_a=tokenization.convert_to_unicode(row[6]),
                text_b=tokenization.convert_to_unicode(row[7]),
                label=tokenization.convert_to_unicode(row[1])))
    return examples
Example #26
Source File: run_classifier.py From BERT with Apache License 2.0 | 6 votes |
def get_train_examples(self, data_dir):
    """See base class."""
    train_path = os.path.join(
        data_dir, "multinli", "multinli.train.%s.tsv" % self.language)
    examples = []
    for i, row in enumerate(self._read_tsv(train_path)):
        if i == 0:
            continue  # header
        text_a = tokenization.convert_to_unicode(row[0])
        text_b = tokenization.convert_to_unicode(row[1])
        label = tokenization.convert_to_unicode(row[2])
        # "contradictory" in the translated data means "contradiction".
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(guid="train-%d" % i, text_a=text_a,
                         text_b=text_b, label=label))
    return examples
Example #27
Source File: extract_features.py From BERT with Apache License 2.0 | 6 votes |
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file.

    Each non-empty line yields one example; a " ||| " separator splits a
    line into a (text_a, text_b) sentence pair.
    """
    examples = []
    example_id = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break  # end of file
            line = line.strip()
            match = re.match(r"^(.*) \|\|\| (.*)$", line)
            if match:
                text_a, text_b = match.group(1), match.group(2)
            else:
                text_a, text_b = line, None
            examples.append(
                InputExample(unique_id=example_id, text_a=text_a,
                             text_b=text_b))
            example_id += 1
    return examples
Example #28
Source File: run_classifier.py From wsdm19cup with MIT License | 6 votes |
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    Only the test split has a header row. Test rows: sentence in column 1
    with a dummy "0" label. Train/dev rows: sentence in column 3, label
    in column 1.
    """
    examples = []
    for i, row in enumerate(lines):
        if set_type == "test" and i == 0:
            continue  # the test set's header row
        if set_type == "test":
            text_a = tokenization.convert_to_unicode(row[1])
            label = "0"
        else:
            text_a = tokenization.convert_to_unicode(row[3])
            label = tokenization.convert_to_unicode(row[1])
        examples.append(
            InputExample(guid="%s-%s" % (set_type, i), text_a=text_a,
                         text_b=None, label=label))
    return examples
Example #29
Source File: run_classifier.py From wsdm19cup with MIT License | 6 votes |
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    The guid embeds the row id (column 0); the sentence pair is in
    columns 8 and 9 and the label in the last column. Test rows get the
    placeholder label "contradiction".
    """
    examples = []
    for i, row in enumerate(lines):
        if i == 0:
            continue  # header
        guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(row[0]))
        if set_type == "test":
            label = "contradiction"
        else:
            label = tokenization.convert_to_unicode(row[-1])
        examples.append(
            InputExample(
                guid=guid,
                text_a=tokenization.convert_to_unicode(row[8]),
                text_b=tokenization.convert_to_unicode(row[9]),
                label=label))
    return examples
Example #30
Source File: run_classifier.py From wsdm19cup with MIT License | 6 votes |
def get_test_examples(self, data_dir):
    """See base class."""
    examples = []
    rows = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv"))
    target = tokenization.convert_to_unicode(self.language)
    for i, row in enumerate(rows):
        if i == 0:
            continue  # header
        # Keep only rows in this processor's target language.
        if tokenization.convert_to_unicode(row[0]) != target:
            continue
        examples.append(
            InputExample(
                guid="test-%d" % i,
                text_a=tokenization.convert_to_unicode(row[6]),
                text_b=tokenization.convert_to_unicode(row[7]),
                label=tokenization.convert_to_unicode(row[1])))
    return examples