Python bert.tokenization.convert_to_unicode() Examples
The following are 30
code examples of bert.tokenization.convert_to_unicode().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
bert.tokenization
, or try the search function
.
Example #1
Source File: run_classifier.py From language with Apache License 2.0 | 6 votes |
def get_dev_examples(self, data_dir): """See base class.""" lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "dev-%d" % (i) language = tokenization.convert_to_unicode(line[0]) if language != tokenization.convert_to_unicode(self.language): continue text_a = tokenization.convert_to_unicode(line[6]) text_b = tokenization.convert_to_unicode(line[7]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #2
Source File: run_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if set_type == "test" and i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[1]) label = "0" else: text_a = tokenization.convert_to_unicode(line[3]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
Example #3
Source File: run_classifier.py From language with Apache License 2.0 | 6 votes |
def get_train_examples(self, data_dir): """See base class.""" lines = self._read_tsv( os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % self.language)) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "train-%d" % (i) text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) label = tokenization.convert_to_unicode(line[2]) if label == tokenization.convert_to_unicode("contradictory"): label = tokenization.convert_to_unicode("contradiction") examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #4
Source File: run_classifier.py From QGforQA with MIT License | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) text_a = tokenization.convert_to_unicode(line[8]) text_b = tokenization.convert_to_unicode(line[9]) if set_type == "test": label = "contradiction" else: label = tokenization.convert_to_unicode(line[-1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #5
Source File: run_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) if set_type == "test": label = "0" else: label = tokenization.convert_to_unicode(line[0]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #6
Source File: run_classifier_distillation.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] sentence_index = 0 for (i, line) in enumerate(lines): if i == 0: # Identify the missing index for j, token in enumerate(line): if token.strip() == "sentence": sentence_index = j continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[sentence_index]) label = [1.0, 0] else: text_a = tokenization.convert_to_unicode(line[sentence_index]) label = [float(line[2]), float(line[3])] examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
Example #7
Source File: classifier_utils.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] print("length of lines:", len(lines)) for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) try: label = tokenization.convert_to_unicode(line[2]) text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) except Exception: # pylint: disable=broad-except print("###error.i:", i, line) return examples
Example #8
Source File: run_concat_classifier.py From language with Apache License 2.0 | 6 votes |
def get_train_examples(self, data_dir): """See base class.""" lines = self._read_tsv( os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % self.language)) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "train-%d" % (i) text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) label = tokenization.convert_to_unicode(line[2]) if label == tokenization.convert_to_unicode("contradictory"): label = tokenization.convert_to_unicode("contradiction") examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #9
Source File: run_concat_classifier.py From language with Apache License 2.0 | 6 votes |
def get_dev_examples(self, data_dir): """See base class.""" lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "dev-%d" % (i) language = tokenization.convert_to_unicode(line[0]) if language != tokenization.convert_to_unicode(self.language): continue text_a = tokenization.convert_to_unicode(line[6]) text_b = tokenization.convert_to_unicode(line[7]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #10
Source File: run_concat_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) text_a = tokenization.convert_to_unicode(line[8]) text_b = tokenization.convert_to_unicode(line[9]) if set_type == "test": label = "contradiction" else: label = tokenization.convert_to_unicode(line[-1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #11
Source File: run_concat_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) if set_type == "test": label = "0" else: label = tokenization.convert_to_unicode(line[0]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #12
Source File: run_concat_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) label = "0" else: text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) label = tokenization.convert_to_unicode(line[5]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #13
Source File: run_concat_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[1]) text_b = tokenization.convert_to_unicode(line[2]) label = "0" else: text_a = tokenization.convert_to_unicode(line[1]) text_b = tokenization.convert_to_unicode(line[2]) label = tokenization.convert_to_unicode(line[3]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #14
Source File: run_concat_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[0]) label = "0" else: text_a = tokenization.convert_to_unicode(line[0]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
Example #15
Source File: extract_features.py From Bert-TextClassification with MIT License | 6 votes |
def read_examples(lst_strs): """Read a list of `InputExample`s from a list of strings.""" unique_id = 0 for ss in lst_strs: line = tokenization.convert_to_unicode(ss) if not line: continue line = line.strip() text_a = None text_b = None m = re.match(r"^(.*) \|\|\| (.*)$", line) if m is None: text_a = line else: text_a = m.group(1) text_b = m.group(2) yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) unique_id += 1
Example #16
Source File: run_classifier.py From language with Apache License 2.0 | 6 votes |
def get_train_examples(self, data_dir): """See base class.""" lines = self._read_tsv( os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % self.language)) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "train-%d" % (i) text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) label = tokenization.convert_to_unicode(line[2]) if label == tokenization.convert_to_unicode("contradictory"): label = tokenization.convert_to_unicode("contradiction") examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #17
Source File: run_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) if set_type == "test": label = "0" else: label = tokenization.convert_to_unicode(line[0]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #18
Source File: run_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if set_type == "test" and i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[1]) label = "0" else: text_a = tokenization.convert_to_unicode(line[3]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
Example #19
Source File: run_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) label = "0" else: text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) label = tokenization.convert_to_unicode(line[5]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #20
Source File: run_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[1]) text_b = tokenization.convert_to_unicode(line[2]) label = "0" else: text_a = tokenization.convert_to_unicode(line[1]) text_b = tokenization.convert_to_unicode(line[2]) label = tokenization.convert_to_unicode(line[3]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #21
Source File: run_classifier.py From language with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[0]) label = "0" else: text_a = tokenization.convert_to_unicode(line[0]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
Example #22
Source File: run_text_classification.py From pynlp with MIT License | 6 votes |
def get_train_examples(self, data_dir): """See base class.""" lines = self._read_tsv( os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % self.language)) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "train-%d" % (i) text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) label = tokenization.convert_to_unicode(line[2]) if label == tokenization.convert_to_unicode("contradictory"): label = tokenization.convert_to_unicode("contradiction") examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #23
Source File: run_text_classification.py From pynlp with MIT License | 6 votes |
def get_dev_examples(self, data_dir): """See base class.""" lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "dev-%d" % (i) language = tokenization.convert_to_unicode(line[0]) if language != tokenization.convert_to_unicode(self.language): continue text_a = tokenization.convert_to_unicode(line[6]) text_b = tokenization.convert_to_unicode(line[7]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #24
Source File: run_text_classification.py From pynlp with MIT License | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) if set_type == "test": label = "0" else: label = tokenization.convert_to_unicode(line[0]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #25
Source File: run_text_classification.py From pynlp with MIT License | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if set_type == "test" and i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[1]) label = "0" else: text_a = tokenization.convert_to_unicode(line[3]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
Example #26
Source File: run_text_classification.py From BERT-for-Sequence-Labeling-and-Text-Classification with Apache License 2.0 | 6 votes |
def get_train_examples(self, data_dir): """See base class.""" lines = self._read_tsv( os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % self.language)) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "train-%d" % (i) text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) label = tokenization.convert_to_unicode(line[2]) if label == tokenization.convert_to_unicode("contradictory"): label = tokenization.convert_to_unicode("contradiction") examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #27
Source File: run_text_classification.py From BERT-for-Sequence-Labeling-and-Text-Classification with Apache License 2.0 | 6 votes |
def get_dev_examples(self, data_dir): """See base class.""" lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "dev-%d" % (i) language = tokenization.convert_to_unicode(line[0]) if language != tokenization.convert_to_unicode(self.language): continue text_a = tokenization.convert_to_unicode(line[6]) text_b = tokenization.convert_to_unicode(line[7]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #28
Source File: run_text_classification.py From BERT-for-Sequence-Labeling-and-Text-Classification with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) if set_type == "test": label = "0" else: label = tokenization.convert_to_unicode(line[0]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
Example #29
Source File: run_text_classification.py From BERT-for-Sequence-Labeling-and-Text-Classification with Apache License 2.0 | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if set_type == "test" and i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[1]) label = "0" else: text_a = tokenization.convert_to_unicode(line[3]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
Example #30
Source File: run_TCM.py From MedicalRelationExtraction with MIT License | 6 votes |
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if len(line) != 4: print(line) guid = "%s-%s-%s" % (set_type, tokenization.convert_to_unicode(line[0]), str(i)) target = tokenization.convert_to_unicode(line[1]) text = tokenization.convert_to_unicode(line[2]) label = tokenization.convert_to_unicode(line[3]) examples.append( InputExample(guid=guid, text_a=target, text_b=text, label=label)) # print(guid, target, text, label) # exit() return examples