Python bert.tokenization.FullTokenizer() Examples
The following are 30 code examples of bert.tokenization.FullTokenizer(), collected from open-source projects. The original project and source file for each example are noted above its code. You may also want to check out the other available functions and classes of the bert.tokenization module.
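Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: build a FullTokenizer from a vocabulary file, tokenize raw text into WordPiece tokens, and convert the tokens to vocabulary IDs. The tiny vocabulary and temporary file below are illustrative assumptions, not taken from any of the projects; the tokenizer calls themselves (FullTokenizer, tokenize, convert_tokens_to_ids) are the same ones used throughout the examples.

import os
import tempfile

from bert import tokenization  # pip install bert-tensorflow

# Toy vocabulary written to a temporary file; real models ship a vocab.txt
# with roughly 30k entries alongside the checkpoint.
vocab_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "hello", "##world"]
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as vocab_writer:
    vocab_writer.write("".join(token + "\n" for token in vocab_tokens))
    vocab_file = vocab_writer.name

# FullTokenizer chains BasicTokenizer (whitespace/punctuation splitting,
# optional lower-casing) with WordpieceTokenizer (greedy longest-match subwords).
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

tokens = tokenizer.tokenize("Hello helloworld")
print(tokens)                                    # expected: ['hello', 'hello', '##world']
print(tokenizer.convert_tokens_to_ids(tokens))   # expected: [4, 4, 5]

os.unlink(vocab_file)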
Example #1
Source File: tokenization_test.py From QGforQA with MIT License

def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        if six.PY2:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        else:
            vocab_writer.write("".join(
                [x + "\n" for x in vocab_tokens]).encode("utf-8"))

        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
Example #2
Source File: server.py From Bert-TextClassification with MIT License

def __init__(self, id, args, worker_address, sink_address):
    super().__init__()
    self.model_dir = args.model_dir
    self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
    self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
    self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.max_seq_len = args.max_seq_len
    self.worker_id = id
    self.daemon = True
    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.checkpoint_fp,
        pooling_strategy=args.pooling_strategy,
        pooling_layer=args.pooling_layer
    )
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
    self.estimator = Estimator(self.model_fn, config=RunConfig(session_config=config))
    self.exit_flag = multiprocessing.Event()
    self.logger = set_logger('WORKER-%d' % self.worker_id)
    self.worker_address = worker_address
    self.sink_address = sink_address
Example #3
Source File: tokenization_test.py From Bert-TextClassification with MIT License

def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
Example #4
Source File: bert_example.py From lasertagger with Apache License 2.0

def __init__(self, label_map, vocab_file, max_seq_length, do_lower_case,
             converter):
    """Initializes an instance of BertExampleBuilder.

    Args:
        label_map: Mapping from tags to tag IDs.
        vocab_file: Path to BERT vocabulary file.
        max_seq_length: Maximum sequence length.
        do_lower_case: Whether to lower case the input text. Should be True for
            uncased models and False for cased models.
        converter: Converter from text targets to tags.
    """
    self._label_map = label_map
    self._tokenizer = tokenization.FullTokenizer(vocab_file,
                                                 do_lower_case=do_lower_case)
    self._max_seq_length = max_seq_length
    self._converter = converter
    self._pad_id = self._get_pad_id()
    self._keep_tag_id = self._label_map['KEEP']
Example #5
Source File: process_PeerRead_abstracts.py From causal-text-embeddings with MIT License

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--review-json-dir', type=str,
                        default='../dat/PeerRead/arxiv.all/all/reviews')
    parser.add_argument('--parsedpdf-json-dir', type=str,
                        default='../dat/PeerRead/arxiv.all/all/parsed_pdfs')
    parser.add_argument('--out-dir', type=str, default='../dat/PeerRead/proc')
    parser.add_argument('--out-file', type=str, default='arxiv-all.tf_record')
    parser.add_argument('--vocab-file', type=str,
                        default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    parser.add_argument('--max-abs-len', type=int, default=250)
    parser.add_argument('--venue', type=int, default=0)
    parser.add_argument('--year', type=int, default=2017)

    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    clean_PeerRead_dataset(args.review_json_dir, args.parsedpdf_json_dir,
                           args.venue, args.year,
                           args.out_dir, args.out_file,
                           args.max_abs_len, tokenizer, is_arxiv=True)
Example #6
Source File: process_reddit.py From causal-text-embeddings with MIT License

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', type=str, default=None)
    parser.add_argument('--out-dir', type=str, default='../dat/reddit')
    parser.add_argument('--out-file', type=str, default='proc.tf_record')
    parser.add_argument('--vocab-file', type=str,
                        default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    parser.add_argument('--max-abs-len', type=int, default=128)
    parser.add_argument('--subsample', type=int, default=0)
    parser.add_argument('--use-latest-reddit', type=bool, default=True)

    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    process_reddit_dataset(args.data_dir, args.out_dir, args.out_file,
                           args.max_abs_len, tokenizer, args.subsample,
                           args.use_latest_reddit)
Example #7
Source File: train_decoder_layer.py From sqlova with Apache License 2.0

def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if no_pretraining:
        pass
    else:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #8
Source File: train.py From sqlova with Apache License 2.0

def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if no_pretraining:
        pass
    else:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #9
Source File: train_shallow_layer.py From sqlova with Apache License 2.0

def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if no_pretraining:
        pass
    else:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #10
Source File: sent_eval.py From embedding with MIT License

def __init__(self, model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
             bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
             vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
             max_seq_length=32, dimension=768, num_labels=2, use_notebook=False):
    super().__init__("bert", dimension, use_notebook)
    config = BertConfig.from_json_file(bertconfig_fname)
    self.max_seq_length = max_seq_length
    self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    self.model, self.input_ids, self.input_mask, self.segment_ids, self.probs = make_bert_graph(
        config, max_seq_length, 1.0, num_labels, tune=False)

    saver = tf.train.Saver(tf.global_variables())
    self.sess = tf.Session()
    checkpoint_path = tf.train.latest_checkpoint(model_fname)
    saver.restore(self.sess, checkpoint_path)
Example #11
Source File: preprocess_qa.py From language with Apache License 2.0

def main(_):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    examples = read_examples(input_file=FLAGS.input_file)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(examples)

    # We write to a temporary file to avoid storing very large
    # constant tensors in memory.
    writer = FeatureWriter(filename=FLAGS.output_file)
    features = []

    def append_feature(feature):
        features.append(feature)
        writer.process_feature(feature)

    convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_doc_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        output_fn=append_feature)
    writer.close()

    tf.logging.info("%d original examples read.", len(examples))
    tf.logging.info("%d split records written.", writer.num_features)

    if FLAGS.feature_file is not None:
        json.dump([[vars(ee) for ee in examples], [vars(ff) for ff in features]],
                  tf.gfile.Open(FLAGS.feature_file, "w"))
Example #12
Source File: run_classifier_with_tfhub.py From QGforQA with MIT License

def create_tokenizer_from_hub_module(bert_hub_module_handle):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_hub_module_handle)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])
    return tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
Example #13
Source File: raw_books_preproc_pipeline.py From language with Apache License 2.0

def preproc_doc(document):
    """Convert document to list of TF Examples for binary order classification.

    Args:
        document: a CCNews article (ie. a list of sentences)

    Returns:
        A list of tfexamples of binary orderings of pairs of sentences in the
        document. The tfexamples are serialized to string to be written directly
        to TFRecord.
    """
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # document = [
    #     tokenization.convert_to_unicode(
    #         unidecode.unidecode(line.decode("utf-8"))) for line in document
    # ]

    sent_tokens = [tokenizer.tokenize(sent) for sent in document if sent]
    sent_tokens = [sent for sent in sent_tokens if len(sent) > 1]
    if len(sent_tokens) < 8:
        return []

    # Convert token lists into ids and add any needed tokens and padding for BERT
    tf_example = convert_instance_to_tf_example(tokenizer, sent_tokens,
                                                FLAGS.max_sent_length,
                                                FLAGS.max_para_length)

    # Serialize TFExample for writing to file.
    tf_examples = [tf_example.SerializeToString()]

    return tf_examples
Example #14
Source File: ccnews_preproc_pipeline.py From language with Apache License 2.0

def preproc_doc(document):
    """Convert document to list of TF Examples for binary order classification.

    Args:
        document: a CCNews article (ie. a list of sentences)

    Returns:
        A list of tfexamples of binary orderings of pairs of sentences in the
        document. The tfexamples are serialized to string to be written directly
        to TFRecord.
    """
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    document = [
        tokenization.convert_to_unicode(
            unidecode.unidecode(line.decode("utf-8"))) for line in document
    ]

    sent_tokens = [tokenizer.tokenize(sent) for sent in document if sent]
    sent_tokens = [sent for sent in sent_tokens if len(sent) > 1]
    if len(sent_tokens) < 8:
        return []

    # Convert token lists into ids and add any needed tokens and padding for BERT
    tf_example = convert_instance_to_tf_example(tokenizer, sent_tokens,
                                                FLAGS.max_sent_length,
                                                FLAGS.max_para_length)

    # Serialize TFExample for writing to file.
    tf_examples = [tf_example.SerializeToString()]

    return tf_examples
Example #15
Source File: convert_to_examples.py From language with Apache License 2.0

def main(unused_argv):
    tokenizer = FullTokenizer(FLAGS.tokenizer_vocabulary)

    print('Loading ' + str(FLAGS.dataset_name) + ' dataset from ' +
          FLAGS.input_filepath)

    # The debugging file saves all of the processed SQL queries.
    debugging_file = gfile.Open(
        os.path.join('/'.join(FLAGS.output_filepath.split('/')[:-1]),
                     FLAGS.dataset_name + '_'.join(FLAGS.splits) + '_gold.txt'),
        'w')

    # The output file will save a sequence of string-serialized JSON objects, one
    # line per object.
    output_file = gfile.Open(os.path.join(FLAGS.output_filepath), 'w')

    if FLAGS.dataset_name.lower() == 'spider':
        num_examples_created, num_examples_failed = process_spider(
            output_file, debugging_file, tokenizer)
    elif FLAGS.dataset_name.lower() == 'wikisql':
        num_examples_created, num_examples_failed = process_wikisql(
            output_file, debugging_file, tokenizer)
    else:
        num_examples_created, num_examples_failed = process_michigan_datasets(
            output_file, debugging_file, tokenizer)

    print('Wrote %s examples, could not annotate %s examples.' %
          (num_examples_created, num_examples_failed))
    debugging_file.write('Wrote %s examples, could not annotate %s examples.' %
                         (num_examples_created, num_examples_failed))
    debugging_file.close()
    output_file.close()
Example #16
Source File: create_tfrecords.py From language with Apache License 2.0

def main(_):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    examples = read_examples(input_file=FLAGS.input_file)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(examples)

    # We write to a temporary file to avoid storing very large
    # constant tensors in memory.
    writer = FeatureWriter(filename=FLAGS.output_file)
    features = []

    def append_feature(feature):
        features.append(feature)
        writer.process_feature(feature)

    convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_doc_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        output_fn=append_feature)
    writer.close()

    tf.logging.info("%d original examples read.", len(examples))
    tf.logging.info("%d split records written.", writer.num_features)

    if FLAGS.feature_file is not None:
        json.dump([[vars(ee) for ee in examples], [vars(ff) for ff in features]],
                  tf.gfile.Open(FLAGS.feature_file, "w"))
Example #17
Source File: create_pretraining_data.py From QGforQA with MIT License

def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq, rng)

    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq, output_files)
Example #18
Source File: run_nq.py From language with Apache License 2.0

def __init__(self, is_training):
    self.is_training = is_training
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
Example #19
Source File: preprocess_bert_dataset.py From delta with Apache License 2.0

def bert_preprocess(filename, vocab):
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=False)
    new_filename = filename + ".bert"
    f1 = open(new_filename, 'w')
    per_count = 0
    with open(filename, "r") as f:
        lines = f.readlines()
        for line in lines:
            str1 = line.split("\t")[1]
            label1 = line.split("\t")[0]
            new_label_list = []
            old_label_list = label1.split(' ')
            word_list = str1.split(' ')
            tokens = []
            tokens.append('[CLS]')
            new_label_list.append('O')
            per_count = 0
            for i, (w, t) in enumerate(zip(word_list, old_label_list)):
                token = tokenizer.tokenize(w)
                tokens.extend(token)
                for i, _ in enumerate(token):
                    if i == 0:
                        new_label_list.append(t)
                    else:
                        new_label_list.append("X")
            tokens.append('[SEG]')
            new_label_list.append('O')
            assert len(tokens) == len(new_label_list)
            rm_new_label_list = [i for i in new_label_list if i != 'O' and i != 'X']
            rm_old_label_list = [i for i in old_label_list if i != 'O' and i != 'X']
            assert len(rm_new_label_list) == len(rm_old_label_list)
            f1.write(" ".join(new_label_list) + '\t' + " ".join(tokens) + '\n')
Example #20
Source File: bert_sim.py From chinese-bert-similarity with MIT License

def __init__(self, gpu_no, log_dir, bert_sim_dir, verbose=False):
    self.bert_sim_dir = bert_sim_dir
    self.logger = set_logger(colored('BS', 'cyan'), log_dir, verbose)
    self.tf = import_tf(gpu_no, verbose)

    # add tokenizer
    from bert import tokenization
    self.tokenizer = tokenization.FullTokenizer(os.path.join(bert_sim_dir, 'vocab.txt'))

    # add placeholder
    self.input_ids = self.tf.placeholder(self.tf.int32, (None, 45), 'input_ids')
    self.input_mask = self.tf.placeholder(self.tf.int32, (None, 45), 'input_mask')
    self.input_type_ids = self.tf.placeholder(self.tf.int32, (None, 45), 'input_type_ids')

    # init graph
    self._init_graph()
Example #21
Source File: bert_predict.py From FoolNLTK with Apache License 2.0

def __init__(self, export_model_path, vocab_file):
    self.export_model_path = export_model_path
    self.vocab_file = vocab_file
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                do_lower_case=True)
    self.predict_fn = predictor.from_saved_model(self.export_model_path)
    self.label_map = pickle.load(open(LABEL_FILE, 'rb'))
    self.id_to_label = {v: k for k, v in self.label_map.items()}
Example #22
Source File: minimize.py From coref with Apache License 2.0

def minimize_language(language, labels, stats, vocab_file, seg_len, input_dir,
                      output_dir, do_lower_case):
    # do_lower_case = True if 'chinese' in vocab_file else False
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    minimize_partition("dev", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
    minimize_partition("train", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
    minimize_partition("test", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
Example #23
Source File: overlap_minimize.py From coref with Apache License 2.0

def minimize_language(language, labels, stats, vocab_file, seg_len, input_dir,
                      output_dir, do_lower_case):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    minimize_partition("dev", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
    minimize_partition("train", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
    minimize_partition("test", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
Example #24
Source File: run_similarity.py From KBQA-BERT with MIT License

def __init__(self, batch_size=args.batch_size):
    self.mode = None
    self.max_seq_length = args.max_seq_len
    self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                                do_lower_case=True)
    self.batch_size = batch_size
    self.estimator = None
    self.processor = SimProcessor()
    tf.logging.set_verbosity(tf.logging.INFO)
Example #25
Source File: strings_utils.py From ludwig with Apache License 2.0

def __init__(self, vocab_file=None, **kwargs):
    super().__init__()
    if vocab_file is None:
        raise ValueError(
            'Vocabulary file is required to initialize BERT tokenizer'
        )
    try:
        from bert.tokenization import FullTokenizer
    except ImportError:
        raise ValueError(
            "Please install bert-tensorflow: pip install bert-tensorflow"
        )
    self.tokenizer = FullTokenizer(vocab_file)
Example #26
Source File: create_pretraining_data.py From causal-text-embeddings with MIT License

def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq, rng)

    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq, output_files)
Example #27
Source File: array_from_dataset.py From causal-text-embeddings with MIT License

def buzzy_title_based_sim_dfs(treat_strength, con_strength, noise_level, setting="simple", seed=0,
                              base_output_dir='../dat/sim/peerread_buzzytitle_based/'):
    labeler = make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level,
                                                 setting=setting, seed=seed)

    num_splits = 10
    dev_splits = [0]
    test_splits = [0]

    # data_file = '../dat/reddit/proc.tf_record'
    # vocab_file = "../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt"

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

    input_dataset_from_filenames = make_input_fn_from_file(data_file,
                                                           250,
                                                           num_splits,
                                                           dev_splits,
                                                           test_splits,
                                                           tokenizer,
                                                           is_training=False,
                                                           filter_test=False,
                                                           shuffle_buffer_size=25000,
                                                           seed=seed,
                                                           labeler=labeler)
    output_df = dataset_fn_to_df(input_dataset_from_filenames)
    output_df = output_df.rename(index=str, columns={'theorem_referenced': 'treatment'})

    output_dir = os.path.join(base_output_dir, "mode{}".format(setting))
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir,
                               "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level))
    output_df.to_csv(output_path, '\t')
Example #28
Source File: clean_PeerRead.py From causal-text-embeddings with MIT License

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasets-dir', type=str, default='../dat/PeerRead')
    parser.add_argument('--vocab-file', type=str,
                        default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    args = parser.parse_args()

    datasets_dir = args.datasets_dir

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    def proc_dataset(dataset):
        all_dir = os.path.join(datasets_dir, dataset_paths[dataset], 'all')
        review_json_dir = os.path.join(all_dir, 'reviews')
        parsedpdf_json_dir = os.path.join(all_dir, 'parsed_pdfs')

        venue = dataset_venues[dataset]
        year = dataset_years[dataset]

        out_dir = os.path.join(datasets_dir, 'proc')
        out_file = dataset + '.tf_record'
        max_abs_len = 250

        clean_PeerRead_dataset(review_json_dir, parsedpdf_json_dir, venue, year,
                               out_dir, out_file, max_abs_len, tokenizer)

    # pool = mp.Pool(4)
    # pool.map(proc_dataset, dataset_names)

    for dataset in dataset_names:
        proc_dataset(dataset)
Example #29
Source File: extra_vocab.py From causal-text-embeddings with MIT License

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--review-json-dir', type=str, default=None)
    parser.add_argument('--vocab-file', type=str, default=None)

    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    review_json_dir = args.review_json_dir

    print('Reading reviews from...', review_json_dir)
    paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir)))

    paper_json_filename = paper_json_filenames[0]
    with io.open(paper_json_filename) as json_file:
        loaded = json.load(json_file)
    abstract = loaded['abstract']
    print(abstract)
    tokens = tokenizer.tokenize(abstract)
    print(tokens)
    print(tokenizer.convert_tokens_to_ids(tokens))

    # for idx, paper_json_filename in enumerate(paper_json_filenames):
    #     with io.open(paper_json_filename) as json_file:
    #         loaded = json.load(json_file)
    #     print(loaded['abstract'])
Example #30
Source File: tune_utils.py From embedding with MIT License

def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
             test_corpus_fname=None, tokenized_test_corpus_fname=None,
             model_name="bert", model_save_path=None, vocab_fname=None,
             eval_every=1000, batch_size=32, num_epochs=10,
             dropout_keep_prob_rate=0.9, model_ckpt_path=None,
             sp_model_path=None):
    # configurations
    tf.logging.set_verbosity(tf.logging.INFO)
    self.model_name = model_name
    self.eval_every = eval_every
    self.model_ckpt_path = model_ckpt_path
    self.model_save_path = model_save_path
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.dropout_keep_prob_rate = dropout_keep_prob_rate
    self.best_valid_score = 0.0
    if not os.path.exists(model_save_path):
        os.mkdir(model_save_path)
    # define tokenizer
    if self.model_name == "bert":
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    elif self.model_name == "xlnet":
        sp = spm.SentencePieceProcessor()
        sp.Load(sp_model_path)
        self.tokenizer = sp
    else:
        self.tokenizer = get_tokenizer("mecab")
    # load or tokenize corpus
    self.train_data, self.train_data_size = self.load_or_tokenize_corpus(
        train_corpus_fname, tokenized_train_corpus_fname)
    self.test_data, self.test_data_size = self.load_or_tokenize_corpus(
        test_corpus_fname, tokenized_test_corpus_fname)