Python bert.tokenization.printable_text() Examples
The following are 26 code examples of bert.tokenization.printable_text(), drawn from open-source projects. You can go to the original project or source file by following the link above each example. You may also want to check out all available functions and classes of the bert.tokenization module.
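As background for the examples below: printable_text() coerces a token to a plain str that is safe to print or pass to tf.logging, decoding UTF-8 bytes when needed. The following is a minimal Python 3 sketch of the helper as it appears in the BERT repository (the upstream version also carries Python 2 branches):

def printable_text(text):
    """Returns `text` as a `str` suitable for printing or logging (Python 3 sketch)."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        # Tokens read from a vocab file may arrive as UTF-8 bytes.
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))

# Both calls print the token "hello":
print(printable_text("hello"))
print(printable_text(b"hello"))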
Example #1
Source File: triviaqa_document_utils.py From RE3QA with Apache License 2.0 | 6 votes |
def __repr__(self): s = "" s += "document_id: %s" % (self.document_id) s += ", qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: %s ..." % (" ".join(self.doc_tokens[:20])) s += ", length of doc_tokens: %d" % (len(self.doc_tokens)) if self.orig_answer_texts: s += ", orig_answer_texts: {}".format(self.orig_answer_texts) if self.start_positions and self.end_positions: s += ", start_positions: {}".format(self.start_positions) s += ", end_positions: {}".format(self.end_positions) s += ", token_answer: " for start, end in zip(self.start_positions, self.end_positions): s += "{}, ".format(" ".join(self.doc_tokens[start:(end+1)])) return s
Example #2
Source File: run_bert_open_qa_eval.py From XQA with MIT License | 5 votes |
def __repr__(self): s = "" s += "id: %s" % (self.qid) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) s += ", answer_text: %s" % (self.orig_answer_text) return s
Example #3
Source File: squad_document_utils.py From RE3QA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "document_id: %s" % (self.document_id) s += ", qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens[:20])) s += ", length of doc_tokens: [%d]" % (len(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.end_position: s += ", end_position: %d" % (self.end_position) return s
Example #4
Source File: squad_document_utils.py From RE3QA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) return s
Example #5
Source File: squad_open_utils.py From RE3QA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += "doc_index: %d" % (self.doc_index) s += "para_index: %d" % (self.para_index) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) if self.answer_texts is not None: s += ", answer_texts: ".format(self.answer_texts) return s
Example #6
Source File: squad_utils.py From RE3QA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) return s
Example #7
Source File: run_squad.py From MAX-Question-Answering with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #8
Source File: utils.py From SpanABSA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" # s += "example_id: %s" % (tokenization.printable_text(self.example_id)) s += ", sent_tokens: [%s]" % (" ".join(self.sent_tokens)) if self.term_texts: s += ", term_texts: {}".format(self.term_texts) # if self.start_positions: # s += ", start_positions: {}".format(self.start_positions) # if self.end_positions: # s += ", end_positions: {}".format(self.end_positions) if self.polarities: s += ", polarities: {}".format(self.polarities) return s
Example #9
Source File: squad_utils.py From SpanABSA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) return s
Example #10
Source File: run_dualencoder_lsf.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #11
Source File: run_dualencoder_qa.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #12
Source File: create_tfrecords.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #13
Source File: answer_extractor.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #14
Source File: run_squad.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #15
Source File: preprocessing_utils.py From language with Apache License 2.0 | 5 votes |
def __str__(self): s = "" for sent in self.tokens: s += "tokens: %s\n" % (" ".join( [tokenization.printable_text(x) for x in sent])) s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) s += "\n" return s
Example #16
Source File: preprocessing_utils.py From language with Apache License 2.0 | 5 votes |
def __str__(self): s = "" for sent in self.tokens[0]: s += "tokens: %s\n" % (" ".join( [tokenization.printable_text(x) for x in sent])) s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids[0]])) s += "\n" return s
Example #17
Source File: run_squad_membership.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.label_id: s += ", membership label_id: %d" % (self.label_id) return s
Example #18
Source File: run_squad.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #19
Source File: create_pretraining_data.py From QGforQA with MIT License | 5 votes |
def __str__(self): s = "" s += "tokens: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.tokens])) s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) s += "is_random_next: %s\n" % self.is_random_next s += "masked_lm_positions: %s\n" % (" ".join( [str(x) for x in self.masked_lm_positions])) s += "masked_lm_labels: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.masked_lm_labels])) s += "\n" return s
Example #20
Source File: test_squad.py From QGforQA with MIT License | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #21
Source File: drop_utils.py From MTMSN with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", \nquestion: %s" % (" ".join(self.question_tokens)) s += ", \npassage: %s" % (" ".join(self.passage_tokens)) if self.numbers_in_passage: s += ", \nnumbers_in_passage: {}".format(self.numbers_in_passage) if self.number_indices: s += ", \nnumber_indices: {}".format(self.number_indices) if self.answer_type: s += ", \nanswer_type: {}".format(self.answer_type) if self.number_of_answer: s += ", \nnumber_of_answer: {}".format(self.number_of_answer) if self.passage_spans: s += ", \npassage_spans: {}".format(self.passage_spans) if self.question_spans: s += ", \nquestion_spans: {}".format(self.question_spans) if self.add_sub_expressions: s += ", \nadd_sub_expressions: {}".format(self.add_sub_expressions) if self.counts: s += ", \ncounts: {}".format(self.counts) if self.negations: s += ", \nnegations: {}".format(self.negations) if self.answer_annotations: s += ", \nanswer_annotations: {}".format(self.answer_annotations) return s
Example #22
Source File: squad_utils.py From MTMSN with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) return s
Example #23
Source File: create_pretraining_data.py From causal-text-embeddings with MIT License | 5 votes |
def __str__(self): s = "" s += "tokens: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.tokens])) s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) s += "is_random_next: %s\n" % self.is_random_next s += "masked_lm_positions: %s\n" % (" ".join( [str(x) for x in self.masked_lm_positions])) s += "masked_lm_labels: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.masked_lm_labels])) s += "\n" return s
Example #24
Source File: run_bert_open_qa_train.py From XQA with MIT License | 5 votes |
def __repr__(self): s = "" s += "id: %s" % (self.qid) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_positions: %s" % (self.start_position) if self.start_position: s += ", end_positions: %s" % (self.end_position) return s
Example #25
Source File: input_fns.py From language with Apache License 2.0 | 4 votes |
def convert_examples_to_features(examples, tokenizer, max_query_length,
                                 entity2id, output_fn):
    """Loads a data file into a list of `InputBatch`s."""
    for (example_index, example) in tqdm(enumerate(examples)):
        qry_input_ids, qry_input_mask, qry_tokens = get_tokens_and_mask(
            example.question_text, tokenizer, max_query_length)
        relation_input_ids, relation_input_mask = [], []
        if example.relations is not None:
            for relation in example.relations:
                rel_input_ids, rel_input_mask, _ = get_tokens_and_mask(
                    relation, tokenizer, max_query_length)
                relation_input_ids.append(rel_input_ids)
                relation_input_mask.append(rel_input_mask)
        if example_index < 20:
            tf.logging.info("*** Example ***")
            tf.logging.info("unique_id: %s", example.qas_id)
            tf.logging.info(
                "qry_tokens: %s",
                " ".join([tokenization.printable_text(x) for x in qry_tokens]))
            tf.logging.info("qry_input_ids: %s",
                            " ".join([str(x) for x in qry_input_ids]))
            tf.logging.info("qry_input_mask: %s",
                            " ".join([str(x) for x in qry_input_mask]))
            for ii in range(len(relation_input_ids)):
                tf.logging.info(
                    "relation_input_ids_%d: %s", ii,
                    " ".join([str(x) for x in relation_input_ids[ii]]))
                tf.logging.info(
                    "relation_input_mask_%d: %s", ii,
                    " ".join([str(x) for x in relation_input_mask[ii]]))
            tf.logging.info("qry_entity_id: %s (%d)", example.subject_entity[0],
                            entity2id.get(example.subject_entity[0], None))
            tf.logging.info("answer entity: %s", str(example.answer_entity))
        feature = InputFeatures(
            qas_id=example.qas_id.encode("utf-8"),
            qry_tokens=qry_tokens,
            qry_input_ids=qry_input_ids,
            qry_input_mask=qry_input_mask,
            relation_input_ids=relation_input_ids,
            relation_input_mask=relation_input_mask,
            qry_entity_id=[entity2id.get(ee, 0) for ee in example.subject_entity],
            answer_mention=example.answer_mention,
            answer_entity=example.answer_entity,
            bridge_mention=example.bridge_mention,
            bridge_entity=example.bridge_entity)
        # Run callback
        output_fn(feature)
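The helper get_tokens_and_mask() used above is defined elsewhere in the same file and is not shown on this page. The following is only a plausible sketch of what such a helper does for BERT-style inputs; the special tokens, truncation, and padding scheme here are assumptions, not the project's actual code:

def get_tokens_and_mask(text, tokenizer, max_seq_length):
    # Assumed behavior: tokenize, wrap with [CLS]/[SEP], truncate, pad to width.
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens[:max_seq_length - 2] + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # Zero-pad so every query has the same fixed width.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    return input_ids, input_mask, tokens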
Example #26
Source File: loader.py From text_bert_cnn with MIT License | 4 votes |
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Converts all InputExamples into the token form the model consumes,
    producing the four inputs the BERT model needs:
    input_ids: the vocabulary tokens of text_a (the text to classify), at
        character level;
    input_mask: BERT's mask-training flags, all 1 for real tokens;
    segment_ids: sentence markers, all 0 since this scenario only has text_a;
    label_ids: the token of the text label (an index, not one-hot).
    """
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    input_data = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        label_id = label_map[example.label]
        if ex_index < 3:
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
        features = collections.OrderedDict()
        features["input_ids"] = input_ids
        features["input_mask"] = input_mask
        features["segment_ids"] = segment_ids
        features["label_ids"] = label_id
        input_data.append(features)
    return input_data
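A hypothetical invocation of the loader above, assuming the bert-tensorflow package and a local vocab file; the namedtuple is a tiny stand-in for the project's InputExample class, and the path and labels are placeholders, not part of the original repository:

import collections
from bert import tokenization

# Stand-in for the project's InputExample; only the fields the loader reads.
InputExample = collections.namedtuple("InputExample", ["guid", "text_a", "label"])

# vocab.txt is a placeholder path to a BERT vocabulary file.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
examples = [InputExample(guid="train-0", text_a="an example sentence", label="sports")]
features = convert_examples_to_features(
    examples, label_list=["sports", "tech"], max_seq_length=128, tokenizer=tokenizer)
print(features[0]["input_ids"][:10])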