Python bert.tokenization.printable_text() Examples
The following are 26 code examples of bert.tokenization.printable_text(), drawn from open-source projects. You can go to the original project or source file by following the link above each example. You may also want to check out all available functions and classes of the bert.tokenization module.
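As background for the examples below: printable_text() coerces a token to a plain str that is safe to print or pass to tf.logging, decoding UTF-8 bytes when needed. The following is a minimal Python 3 sketch of the helper as it appears in the BERT repository (the upstream version also carries Python 2 branches):

def printable_text(text):
    """Returns `text` as a `str` suitable for printing or logging (Python 3 sketch)."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        # Tokens read from a vocab file may arrive as UTF-8 bytes.
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))

# Both calls print the token "hello":
print(printable_text("hello"))
print(printable_text(b"hello"))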
Example #1
Source File: triviaqa_document_utils.py From RE3QA with Apache License 2.0 | 6 votes |
def __repr__(self): s = "" s += "document_id: %s" % (self.document_id) s += ", qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: %s ..." % (" ".join(self.doc_tokens[:20])) s += ", length of doc_tokens: %d" % (len(self.doc_tokens)) if self.orig_answer_texts: s += ", orig_answer_texts: {}".format(self.orig_answer_texts) if self.start_positions and self.end_positions: s += ", start_positions: {}".format(self.start_positions) s += ", end_positions: {}".format(self.end_positions) s += ", token_answer: " for start, end in zip(self.start_positions, self.end_positions): s += "{}, ".format(" ".join(self.doc_tokens[start:(end+1)])) return s
Example #2
Source File: run_bert_open_qa_eval.py From XQA with MIT License | 5 votes |
def __repr__(self): s = "" s += "id: %s" % (self.qid) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) s += ", answer_text: %s" % (self.orig_answer_text) return s
Example #3
Source File: squad_document_utils.py From RE3QA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "document_id: %s" % (self.document_id) s += ", qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens[:20])) s += ", length of doc_tokens: [%d]" % (len(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.end_position: s += ", end_position: %d" % (self.end_position) return s
Example #4
Source File: squad_document_utils.py From RE3QA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) return s
Example #5
Source File: squad_open_utils.py From RE3QA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += "doc_index: %d" % (self.doc_index) s += "para_index: %d" % (self.para_index) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) if self.answer_texts is not None: s += ", answer_texts: ".format(self.answer_texts) return s
Example #6
Source File: squad_utils.py From RE3QA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) return s
Example #7
Source File: run_squad.py From MAX-Question-Answering with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #8
Source File: utils.py From SpanABSA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" # s += "example_id: %s" % (tokenization.printable_text(self.example_id)) s += ", sent_tokens: [%s]" % (" ".join(self.sent_tokens)) if self.term_texts: s += ", term_texts: {}".format(self.term_texts) # if self.start_positions: # s += ", start_positions: {}".format(self.start_positions) # if self.end_positions: # s += ", end_positions: {}".format(self.end_positions) if self.polarities: s += ", polarities: {}".format(self.polarities) return s
Example #9
Source File: squad_utils.py From SpanABSA with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) return s
Example #10
Source File: run_dualencoder_lsf.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #11
Source File: run_dualencoder_qa.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #12
Source File: create_tfrecords.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #13
Source File: answer_extractor.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #14
Source File: run_squad.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #15
Source File: preprocessing_utils.py From language with Apache License 2.0 | 5 votes |
def __str__(self): s = "" for sent in self.tokens: s += "tokens: %s\n" % (" ".join( [tokenization.printable_text(x) for x in sent])) s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) s += "\n" return s
Example #16
Source File: preprocessing_utils.py From language with Apache License 2.0 | 5 votes |
def __str__(self): s = "" for sent in self.tokens[0]: s += "tokens: %s\n" % (" ".join( [tokenization.printable_text(x) for x in sent])) s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids[0]])) s += "\n" return s
Example #17
Source File: run_squad_membership.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.label_id: s += ", membership label_id: %d" % (self.label_id) return s
Example #18
Source File: run_squad.py From language with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #19
Source File: create_pretraining_data.py From QGforQA with MIT License | 5 votes |
def __str__(self): s = "" s += "tokens: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.tokens])) s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) s += "is_random_next: %s\n" % self.is_random_next s += "masked_lm_positions: %s\n" % (" ".join( [str(x) for x in self.masked_lm_positions])) s += "masked_lm_labels: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.masked_lm_labels])) s += "\n" return s
Example #20
Source File: test_squad.py From QGforQA with MIT License | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) if self.start_position: s += ", is_impossible: %r" % (self.is_impossible) return s
Example #21
Source File: drop_utils.py From MTMSN with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", \nquestion: %s" % (" ".join(self.question_tokens)) s += ", \npassage: %s" % (" ".join(self.passage_tokens)) if self.numbers_in_passage: s += ", \nnumbers_in_passage: {}".format(self.numbers_in_passage) if self.number_indices: s += ", \nnumber_indices: {}".format(self.number_indices) if self.answer_type: s += ", \nanswer_type: {}".format(self.answer_type) if self.number_of_answer: s += ", \nnumber_of_answer: {}".format(self.number_of_answer) if self.passage_spans: s += ", \npassage_spans: {}".format(self.passage_spans) if self.question_spans: s += ", \nquestion_spans: {}".format(self.question_spans) if self.add_sub_expressions: s += ", \nadd_sub_expressions: {}".format(self.add_sub_expressions) if self.counts: s += ", \ncounts: {}".format(self.counts) if self.negations: s += ", \nnegations: {}".format(self.negations) if self.answer_annotations: s += ", \nanswer_annotations: {}".format(self.answer_annotations) return s
Example #22
Source File: squad_utils.py From MTMSN with Apache License 2.0 | 5 votes |
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) if self.start_position: s += ", end_position: %d" % (self.end_position) return s
Example #23
Source File: create_pretraining_data.py From causal-text-embeddings with MIT License | 5 votes |
def __str__(self): s = "" s += "tokens: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.tokens])) s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) s += "is_random_next: %s\n" % self.is_random_next s += "masked_lm_positions: %s\n" % (" ".join( [str(x) for x in self.masked_lm_positions])) s += "masked_lm_labels: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.masked_lm_labels])) s += "\n" return s
Example #24
Source File: run_bert_open_qa_train.py From XQA with MIT License | 5 votes |
def __repr__(self): s = "" s += "id: %s" % (self.qid) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_positions: %s" % (self.start_position) if self.start_position: s += ", end_positions: %s" % (self.end_position) return s
Example #25
Source File: input_fns.py From language with Apache License 2.0 | 4 votes |
def convert_examples_to_features(examples, tokenizer, max_query_length,
                                 entity2id, output_fn):
    """Loads a data file into a list of `InputBatch`s."""
    for (example_index, example) in tqdm(enumerate(examples)):
        qry_input_ids, qry_input_mask, qry_tokens = get_tokens_and_mask(
            example.question_text, tokenizer, max_query_length)
        relation_input_ids, relation_input_mask = [], []
        if example.relations is not None:
            for relation in example.relations:
                rel_input_ids, rel_input_mask, _ = get_tokens_and_mask(
                    relation, tokenizer, max_query_length)
                relation_input_ids.append(rel_input_ids)
                relation_input_mask.append(rel_input_mask)
        if example_index < 20:
            tf.logging.info("*** Example ***")
            tf.logging.info("unique_id: %s", example.qas_id)
            tf.logging.info(
                "qry_tokens: %s",
                " ".join([tokenization.printable_text(x) for x in qry_tokens]))
            tf.logging.info("qry_input_ids: %s",
                            " ".join([str(x) for x in qry_input_ids]))
            tf.logging.info("qry_input_mask: %s",
                            " ".join([str(x) for x in qry_input_mask]))
            for ii in range(len(relation_input_ids)):
                tf.logging.info(
                    "relation_input_ids_%d: %s", ii,
                    " ".join([str(x) for x in relation_input_ids[ii]]))
                tf.logging.info(
                    "relation_input_mask_%d: %s", ii,
                    " ".join([str(x) for x in relation_input_mask[ii]]))
            tf.logging.info("qry_entity_id: %s (%d)", example.subject_entity[0],
                            entity2id.get(example.subject_entity[0], None))
            tf.logging.info("answer entity: %s", str(example.answer_entity))
        feature = InputFeatures(
            qas_id=example.qas_id.encode("utf-8"),
            qry_tokens=qry_tokens,
            qry_input_ids=qry_input_ids,
            qry_input_mask=qry_input_mask,
            relation_input_ids=relation_input_ids,
            relation_input_mask=relation_input_mask,
            qry_entity_id=[entity2id.get(ee, 0) for ee in example.subject_entity],
            answer_mention=example.answer_mention,
            answer_entity=example.answer_entity,
            bridge_mention=example.bridge_mention,
            bridge_entity=example.bridge_entity)
        # Run callback
        output_fn(feature)
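The helper get_tokens_and_mask() used above is defined elsewhere in the same file and is not shown on this page. The following is only a plausible sketch of what such a helper does for BERT-style inputs; the special tokens, truncation, and padding scheme here are assumptions, not the project's actual code:

def get_tokens_and_mask(text, tokenizer, max_seq_length):
    # Assumed behavior: tokenize, wrap with [CLS]/[SEP], truncate, pad to width.
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens[:max_seq_length - 2] + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # Zero-pad so every query has the same fixed width.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    return input_ids, input_mask, tokens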
Example #26
Source File: loader.py From text_bert_cnn with MIT License | 4 votes |
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Converts all InputExamples into the token form the model consumes,
    producing the four inputs the BERT model needs:
    input_ids: the vocabulary tokens of text_a (the text to classify), at
        character level;
    input_mask: BERT's mask-training flags, all 1 for real tokens;
    segment_ids: sentence markers, all 0 since this scenario only has text_a;
    label_ids: the token of the text label (an index, not one-hot).
    """
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    input_data = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        label_id = label_map[example.label]
        if ex_index < 3:
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
        features = collections.OrderedDict()
        features["input_ids"] = input_ids
        features["input_mask"] = input_mask
        features["segment_ids"] = segment_ids
        features["label_ids"] = label_id
        input_data.append(features)
    return input_data
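A hypothetical invocation of the loader above, assuming the bert-tensorflow package and a local vocab file; the namedtuple is a tiny stand-in for the project's InputExample class, and the path and labels are placeholders, not part of the original repository:

import collections
from bert import tokenization

# Stand-in for the project's InputExample; only the fields the loader reads.
InputExample = collections.namedtuple("InputExample", ["guid", "text_a", "label"])

# vocab.txt is a placeholder path to a BERT vocabulary file.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
examples = [InputExample(guid="train-0", text_a="an example sentence", label="sports")]
features = convert_examples_to_features(
    examples, label_list=["sports", "tech"], max_seq_length=128, tokenizer=tokenizer)
print(features[0]["input_ids"][:10])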