Python bert.tokenization.FullTokenizer() Examples
The following are 30 code examples of bert.tokenization.FullTokenizer(), collected from open-source projects. The original project and source file for each example are noted above its code. You may also want to check out the other available functions and classes of the bert.tokenization module.
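Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: build a FullTokenizer from a vocabulary file, tokenize raw text into WordPiece tokens, and convert the tokens to vocabulary IDs. The tiny vocabulary and temporary file below are illustrative assumptions, not taken from any of the projects; the tokenizer calls themselves (FullTokenizer, tokenize, convert_tokens_to_ids) are the same ones used throughout the examples.

import os
import tempfile

from bert import tokenization  # pip install bert-tensorflow

# Toy vocabulary written to a temporary file; real models ship a vocab.txt
# with roughly 30k entries alongside the checkpoint.
vocab_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "hello", "##world"]
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as vocab_writer:
    vocab_writer.write("".join(token + "\n" for token in vocab_tokens))
    vocab_file = vocab_writer.name

# FullTokenizer chains BasicTokenizer (whitespace/punctuation splitting,
# optional lower-casing) with WordpieceTokenizer (greedy longest-match subwords).
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

tokens = tokenizer.tokenize("Hello helloworld")
print(tokens)                                    # expected: ['hello', 'hello', '##world']
print(tokenizer.convert_tokens_to_ids(tokens))   # expected: [4, 4, 5]

os.unlink(vocab_file)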
Example #1
Source File: tokenization_test.py From QGforQA with MIT License

def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        if six.PY2:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        else:
            vocab_writer.write("".join(
                [x + "\n" for x in vocab_tokens]).encode("utf-8"))

        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
Example #2
Source File: server.py From Bert-TextClassification with MIT License

def __init__(self, id, args, worker_address, sink_address):
    super().__init__()
    self.model_dir = args.model_dir
    self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
    self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
    self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.max_seq_len = args.max_seq_len
    self.worker_id = id
    self.daemon = True
    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.checkpoint_fp,
        pooling_strategy=args.pooling_strategy,
        pooling_layer=args.pooling_layer
    )
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
    self.estimator = Estimator(self.model_fn, config=RunConfig(session_config=config))
    self.exit_flag = multiprocessing.Event()
    self.logger = set_logger('WORKER-%d' % self.worker_id)
    self.worker_address = worker_address
    self.sink_address = sink_address
Example #3
Source File: tokenization_test.py From Bert-TextClassification with MIT License

def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
Example #4
Source File: bert_example.py From lasertagger with Apache License 2.0

def __init__(self, label_map, vocab_file, max_seq_length, do_lower_case,
             converter):
    """Initializes an instance of BertExampleBuilder.

    Args:
        label_map: Mapping from tags to tag IDs.
        vocab_file: Path to BERT vocabulary file.
        max_seq_length: Maximum sequence length.
        do_lower_case: Whether to lower case the input text. Should be True for
            uncased models and False for cased models.
        converter: Converter from text targets to tags.
    """
    self._label_map = label_map
    self._tokenizer = tokenization.FullTokenizer(vocab_file,
                                                 do_lower_case=do_lower_case)
    self._max_seq_length = max_seq_length
    self._converter = converter
    self._pad_id = self._get_pad_id()
    self._keep_tag_id = self._label_map['KEEP']
Example #5
Source File: process_PeerRead_abstracts.py From causal-text-embeddings with MIT License

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--review-json-dir', type=str,
                        default='../dat/PeerRead/arxiv.all/all/reviews')
    parser.add_argument('--parsedpdf-json-dir', type=str,
                        default='../dat/PeerRead/arxiv.all/all/parsed_pdfs')
    parser.add_argument('--out-dir', type=str, default='../dat/PeerRead/proc')
    parser.add_argument('--out-file', type=str, default='arxiv-all.tf_record')
    parser.add_argument('--vocab-file', type=str,
                        default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    parser.add_argument('--max-abs-len', type=int, default=250)
    parser.add_argument('--venue', type=int, default=0)
    parser.add_argument('--year', type=int, default=2017)

    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    clean_PeerRead_dataset(args.review_json_dir, args.parsedpdf_json_dir,
                           args.venue, args.year,
                           args.out_dir, args.out_file,
                           args.max_abs_len, tokenizer, is_arxiv=True)
Example #6
Source File: process_reddit.py From causal-text-embeddings with MIT License

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', type=str, default=None)
    parser.add_argument('--out-dir', type=str, default='../dat/reddit')
    parser.add_argument('--out-file', type=str, default='proc.tf_record')
    parser.add_argument('--vocab-file', type=str,
                        default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    parser.add_argument('--max-abs-len', type=int, default=128)
    parser.add_argument('--subsample', type=int, default=0)
    parser.add_argument('--use-latest-reddit', type=bool, default=True)

    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    process_reddit_dataset(args.data_dir, args.out_dir, args.out_file,
                           args.max_abs_len, tokenizer, args.subsample,
                           args.use_latest_reddit)
Example #7
Source File: train_decoder_layer.py From sqlova with Apache License 2.0

def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if no_pretraining:
        pass
    else:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #8
Source File: train.py From sqlova with Apache License 2.0

def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if no_pretraining:
        pass
    else:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #9
Source File: train_shallow_layer.py From sqlova with Apache License 2.0

def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if no_pretraining:
        pass
    else:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #10
Source File: sent_eval.py From embedding with MIT License

def __init__(self, model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
             bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
             vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
             max_seq_length=32, dimension=768, num_labels=2, use_notebook=False):
    super().__init__("bert", dimension, use_notebook)
    config = BertConfig.from_json_file(bertconfig_fname)
    self.max_seq_length = max_seq_length
    self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    self.model, self.input_ids, self.input_mask, self.segment_ids, self.probs = make_bert_graph(
        config, max_seq_length, 1.0, num_labels, tune=False)

    saver = tf.train.Saver(tf.global_variables())
    self.sess = tf.Session()
    checkpoint_path = tf.train.latest_checkpoint(model_fname)
    saver.restore(self.sess, checkpoint_path)
Example #11
Source File: preprocess_qa.py From language with Apache License 2.0

def main(_):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    examples = read_examples(input_file=FLAGS.input_file)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(examples)

    # We write to a temporary file to avoid storing very large
    # constant tensors in memory.
    writer = FeatureWriter(filename=FLAGS.output_file)
    features = []

    def append_feature(feature):
        features.append(feature)
        writer.process_feature(feature)

    convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_doc_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        output_fn=append_feature)
    writer.close()

    tf.logging.info("%d original examples read.", len(examples))
    tf.logging.info("%d split records written.", writer.num_features)

    if FLAGS.feature_file is not None:
        json.dump([[vars(ee) for ee in examples], [vars(ff) for ff in features]],
                  tf.gfile.Open(FLAGS.feature_file, "w"))
Example #12
Source File: run_classifier_with_tfhub.py From QGforQA with MIT License

def create_tokenizer_from_hub_module(bert_hub_module_handle):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_hub_module_handle)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])
    return tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
Example #13
Source File: raw_books_preproc_pipeline.py From language with Apache License 2.0

def preproc_doc(document):
    """Convert document to list of TF Examples for binary order classification.

    Args:
        document: a CCNews article (ie. a list of sentences)

    Returns:
        A list of tfexamples of binary orderings of pairs of sentences in the
        document. The tfexamples are serialized to string to be written directly
        to TFRecord.
    """
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # document = [
    #     tokenization.convert_to_unicode(
    #         unidecode.unidecode(line.decode("utf-8"))) for line in document
    # ]

    sent_tokens = [tokenizer.tokenize(sent) for sent in document if sent]
    sent_tokens = [sent for sent in sent_tokens if len(sent) > 1]
    if len(sent_tokens) < 8:
        return []

    # Convert token lists into ids and add any needed tokens and padding for BERT
    tf_example = convert_instance_to_tf_example(tokenizer, sent_tokens,
                                                FLAGS.max_sent_length,
                                                FLAGS.max_para_length)

    # Serialize TFExample for writing to file.
    tf_examples = [tf_example.SerializeToString()]

    return tf_examples
Example #14
Source File: ccnews_preproc_pipeline.py From language with Apache License 2.0

def preproc_doc(document):
    """Convert document to list of TF Examples for binary order classification.

    Args:
        document: a CCNews article (ie. a list of sentences)

    Returns:
        A list of tfexamples of binary orderings of pairs of sentences in the
        document. The tfexamples are serialized to string to be written directly
        to TFRecord.
    """
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    document = [
        tokenization.convert_to_unicode(
            unidecode.unidecode(line.decode("utf-8"))) for line in document
    ]

    sent_tokens = [tokenizer.tokenize(sent) for sent in document if sent]
    sent_tokens = [sent for sent in sent_tokens if len(sent) > 1]
    if len(sent_tokens) < 8:
        return []

    # Convert token lists into ids and add any needed tokens and padding for BERT
    tf_example = convert_instance_to_tf_example(tokenizer, sent_tokens,
                                                FLAGS.max_sent_length,
                                                FLAGS.max_para_length)

    # Serialize TFExample for writing to file.
    tf_examples = [tf_example.SerializeToString()]

    return tf_examples
Example #15
Source File: convert_to_examples.py From language with Apache License 2.0

def main(unused_argv):
    tokenizer = FullTokenizer(FLAGS.tokenizer_vocabulary)

    print('Loading ' + str(FLAGS.dataset_name) + ' dataset from ' +
          FLAGS.input_filepath)

    # The debugging file saves all of the processed SQL queries.
    debugging_file = gfile.Open(
        os.path.join('/'.join(FLAGS.output_filepath.split('/')[:-1]),
                     FLAGS.dataset_name + '_'.join(FLAGS.splits) + '_gold.txt'),
        'w')

    # The output file will save a sequence of string-serialized JSON objects, one
    # line per object.
    output_file = gfile.Open(os.path.join(FLAGS.output_filepath), 'w')

    if FLAGS.dataset_name.lower() == 'spider':
        num_examples_created, num_examples_failed = process_spider(
            output_file, debugging_file, tokenizer)
    elif FLAGS.dataset_name.lower() == 'wikisql':
        num_examples_created, num_examples_failed = process_wikisql(
            output_file, debugging_file, tokenizer)
    else:
        num_examples_created, num_examples_failed = process_michigan_datasets(
            output_file, debugging_file, tokenizer)

    print('Wrote %s examples, could not annotate %s examples.' %
          (num_examples_created, num_examples_failed))
    debugging_file.write('Wrote %s examples, could not annotate %s examples.' %
                         (num_examples_created, num_examples_failed))
    debugging_file.close()
    output_file.close()
Example #16
Source File: create_tfrecords.py From language with Apache License 2.0

def main(_):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    examples = read_examples(input_file=FLAGS.input_file)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(examples)

    # We write to a temporary file to avoid storing very large
    # constant tensors in memory.
    writer = FeatureWriter(filename=FLAGS.output_file)
    features = []

    def append_feature(feature):
        features.append(feature)
        writer.process_feature(feature)

    convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_doc_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        output_fn=append_feature)
    writer.close()

    tf.logging.info("%d original examples read.", len(examples))
    tf.logging.info("%d split records written.", writer.num_features)

    if FLAGS.feature_file is not None:
        json.dump([[vars(ee) for ee in examples], [vars(ff) for ff in features]],
                  tf.gfile.Open(FLAGS.feature_file, "w"))
Example #17
Source File: create_pretraining_data.py From QGforQA with MIT License

def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq, rng)

    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq, output_files)
Example #18
Source File: run_nq.py From language with Apache License 2.0

def __init__(self, is_training):
    self.is_training = is_training
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
Example #19
Source File: preprocess_bert_dataset.py From delta with Apache License 2.0

def bert_preprocess(filename, vocab):
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=False)
    new_filename = filename + ".bert"
    f1 = open(new_filename, 'w')
    per_count = 0
    with open(filename, "r") as f:
        lines = f.readlines()
        for line in lines:
            str1 = line.split("\t")[1]
            label1 = line.split("\t")[0]
            new_label_list = []
            old_label_list = label1.split(' ')
            word_list = str1.split(' ')
            tokens = []
            tokens.append('[CLS]')
            new_label_list.append('O')
            per_count = 0
            for i, (w, t) in enumerate(zip(word_list, old_label_list)):
                token = tokenizer.tokenize(w)
                tokens.extend(token)
                for i, _ in enumerate(token):
                    if i == 0:
                        new_label_list.append(t)
                    else:
                        new_label_list.append("X")
            tokens.append('[SEG]')
            new_label_list.append('O')
            assert len(tokens) == len(new_label_list)
            rm_new_label_list = [i for i in new_label_list if i != 'O' and i != 'X']
            rm_old_label_list = [i for i in old_label_list if i != 'O' and i != 'X']
            assert len(rm_new_label_list) == len(rm_old_label_list)
            f1.write(" ".join(new_label_list) + '\t' + " ".join(tokens) + '\n')
Example #20
Source File: bert_sim.py From chinese-bert-similarity with MIT License

def __init__(self, gpu_no, log_dir, bert_sim_dir, verbose=False):
    self.bert_sim_dir = bert_sim_dir
    self.logger = set_logger(colored('BS', 'cyan'), log_dir, verbose)
    self.tf = import_tf(gpu_no, verbose)

    # add tokenizer
    from bert import tokenization
    self.tokenizer = tokenization.FullTokenizer(os.path.join(bert_sim_dir, 'vocab.txt'))

    # add placeholder
    self.input_ids = self.tf.placeholder(self.tf.int32, (None, 45), 'input_ids')
    self.input_mask = self.tf.placeholder(self.tf.int32, (None, 45), 'input_mask')
    self.input_type_ids = self.tf.placeholder(self.tf.int32, (None, 45), 'input_type_ids')

    # init graph
    self._init_graph()
Example #21
Source File: bert_predict.py From FoolNLTK with Apache License 2.0

def __init__(self, export_model_path, vocab_file):
    self.export_model_path = export_model_path
    self.vocab_file = vocab_file
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                do_lower_case=True)
    self.predict_fn = predictor.from_saved_model(self.export_model_path)
    self.label_map = pickle.load(open(LABEL_FILE, 'rb'))
    self.id_to_label = {v: k for k, v in self.label_map.items()}
Example #22
Source File: minimize.py From coref with Apache License 2.0

def minimize_language(language, labels, stats, vocab_file, seg_len, input_dir,
                      output_dir, do_lower_case):
    # do_lower_case = True if 'chinese' in vocab_file else False
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    minimize_partition("dev", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
    minimize_partition("train", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
    minimize_partition("test", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
Example #23
Source File: overlap_minimize.py From coref with Apache License 2.0

def minimize_language(language, labels, stats, vocab_file, seg_len, input_dir,
                      output_dir, do_lower_case):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    minimize_partition("dev", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
    minimize_partition("train", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
    minimize_partition("test", language, "v4_gold_conll", labels, stats,
                       tokenizer, seg_len, input_dir, output_dir)
Example #24
Source File: run_similarity.py From KBQA-BERT with MIT License

def __init__(self, batch_size=args.batch_size):
    self.mode = None
    self.max_seq_length = args.max_seq_len
    self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                                do_lower_case=True)
    self.batch_size = batch_size
    self.estimator = None
    self.processor = SimProcessor()
    tf.logging.set_verbosity(tf.logging.INFO)
Example #25
Source File: strings_utils.py From ludwig with Apache License 2.0

def __init__(self, vocab_file=None, **kwargs):
    super().__init__()
    if vocab_file is None:
        raise ValueError(
            'Vocabulary file is required to initialize BERT tokenizer'
        )
    try:
        from bert.tokenization import FullTokenizer
    except ImportError:
        raise ValueError(
            "Please install bert-tensorflow: pip install bert-tensorflow"
        )
    self.tokenizer = FullTokenizer(vocab_file)
Example #26
Source File: create_pretraining_data.py From causal-text-embeddings with MIT License

def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq, rng)

    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq, output_files)
Example #27
Source File: array_from_dataset.py From causal-text-embeddings with MIT License

def buzzy_title_based_sim_dfs(treat_strength, con_strength, noise_level, setting="simple", seed=0,
                              base_output_dir='../dat/sim/peerread_buzzytitle_based/'):
    labeler = make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level,
                                                 setting=setting, seed=seed)

    num_splits = 10
    dev_splits = [0]
    test_splits = [0]

    # data_file = '../dat/reddit/proc.tf_record'
    # vocab_file = "../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt"

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

    input_dataset_from_filenames = make_input_fn_from_file(data_file,
                                                           250,
                                                           num_splits,
                                                           dev_splits,
                                                           test_splits,
                                                           tokenizer,
                                                           is_training=False,
                                                           filter_test=False,
                                                           shuffle_buffer_size=25000,
                                                           seed=seed,
                                                           labeler=labeler)
    output_df = dataset_fn_to_df(input_dataset_from_filenames)
    output_df = output_df.rename(index=str, columns={'theorem_referenced': 'treatment'})

    output_dir = os.path.join(base_output_dir, "mode{}".format(setting))
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir,
                               "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level))
    output_df.to_csv(output_path, '\t')
Example #28
Source File: clean_PeerRead.py From causal-text-embeddings with MIT License

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasets-dir', type=str, default='../dat/PeerRead')
    parser.add_argument('--vocab-file', type=str,
                        default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    args = parser.parse_args()

    datasets_dir = args.datasets_dir

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    def proc_dataset(dataset):
        all_dir = os.path.join(datasets_dir, dataset_paths[dataset], 'all')
        review_json_dir = os.path.join(all_dir, 'reviews')
        parsedpdf_json_dir = os.path.join(all_dir, 'parsed_pdfs')

        venue = dataset_venues[dataset]
        year = dataset_years[dataset]

        out_dir = os.path.join(datasets_dir, 'proc')
        out_file = dataset + '.tf_record'
        max_abs_len = 250

        clean_PeerRead_dataset(review_json_dir, parsedpdf_json_dir, venue, year,
                               out_dir, out_file, max_abs_len, tokenizer)

    # pool = mp.Pool(4)
    # pool.map(proc_dataset, dataset_names)

    for dataset in dataset_names:
        proc_dataset(dataset)
Example #29
Source File: extra_vocab.py From causal-text-embeddings with MIT License

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--review-json-dir', type=str, default=None)
    parser.add_argument('--vocab-file', type=str, default=None)

    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    review_json_dir = args.review_json_dir

    print('Reading reviews from...', review_json_dir)
    paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir)))

    paper_json_filename = paper_json_filenames[0]
    with io.open(paper_json_filename) as json_file:
        loaded = json.load(json_file)
    abstract = loaded['abstract']
    print(abstract)
    tokens = tokenizer.tokenize(abstract)
    print(tokens)
    print(tokenizer.convert_tokens_to_ids(tokens))

    # for idx, paper_json_filename in enumerate(paper_json_filenames):
    #     with io.open(paper_json_filename) as json_file:
    #         loaded = json.load(json_file)
    #     print(loaded['abstract'])
Example #30
Source File: tune_utils.py From embedding with MIT License

def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
             test_corpus_fname=None, tokenized_test_corpus_fname=None,
             model_name="bert", model_save_path=None, vocab_fname=None,
             eval_every=1000, batch_size=32, num_epochs=10,
             dropout_keep_prob_rate=0.9, model_ckpt_path=None,
             sp_model_path=None):
    # configurations
    tf.logging.set_verbosity(tf.logging.INFO)
    self.model_name = model_name
    self.eval_every = eval_every
    self.model_ckpt_path = model_ckpt_path
    self.model_save_path = model_save_path
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.dropout_keep_prob_rate = dropout_keep_prob_rate
    self.best_valid_score = 0.0
    if not os.path.exists(model_save_path):
        os.mkdir(model_save_path)
    # define tokenizer
    if self.model_name == "bert":
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    elif self.model_name == "xlnet":
        sp = spm.SentencePieceProcessor()
        sp.Load(sp_model_path)
        self.tokenizer = sp
    else:
        self.tokenizer = get_tokenizer("mecab")
    # load or tokenize corpus
    self.train_data, self.train_data_size = self.load_or_tokenize_corpus(
        train_corpus_fname, tokenized_train_corpus_fname)
    self.test_data, self.test_data_size = self.load_or_tokenize_corpus(
        test_corpus_fname, tokenized_test_corpus_fname)