Python pytorch_transformers.BertTokenizer.from_pretrained() Examples
The following are 19 code examples of pytorch_transformers.BertTokenizer.from_pretrained().
You can vote up the examples you like or vote down the ones you don't,
and follow the links above each example to the original project or source file.
You may also want to check out all available functions and classes of the pytorch_transformers module.
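Before the examples, here is a minimal usage sketch of the method itself (the model name and sample sentence are illustrative, not taken from any example below):

# Minimal sketch: load a pretrained vocabulary and tokenize a sentence.
# Assumes pytorch-transformers is installed: pip install pytorch-transformers
from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokens = tokenizer.tokenize("Hello, how are you?")  # WordPiece tokens
ids = tokenizer.convert_tokens_to_ids(tokens)       # integer vocabulary ids

from_pretrained() accepts either a model name (downloaded and cached on first use) or a path to a local vocabulary file or directory, a pattern several examples below rely on.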
Example #1
Source File: hf_bert_bpe.py From fairseq with MIT License | 7 votes |
def __init__(self, args):
    try:
        from pytorch_transformers import BertTokenizer
        from pytorch_transformers.tokenization_utils import clean_up_tokenization
    except ImportError:
        raise ImportError(
            'Please install 1.0.0 version of pytorch_transformers '
            'with: pip install pytorch-transformers'
        )

    if 'bpe_vocab_file' in args:
        self.bert_tokenizer = BertTokenizer(
            args.bpe_vocab_file,
            do_lower_case=not args.bpe_cased
        )
    else:
        vocab_file_name = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased'
        self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name)
    self.clean_up_tokenization = clean_up_tokenization
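Note the coupling between the two branches: a local BPE vocab file is paired with do_lower_case=not args.bpe_cased, while the fallback selects the pretrained vocabulary ('bert-base-cased' or 'bert-base-uncased') whose built-in casing behavior matches the same flag.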
Example #2
Source File: explainer.py From fine-grained-sentiment with MIT License | 6 votes |
def __init__(self, model_file: str = None) -> None:
    "Requires the BertTokenizer from pytorch_transformers"  # pip install pytorch_transformers
    import os
    import torch
    from pytorch_transformers import BertTokenizer, cached_path
    from training.transformer_utils.model import TransformerWithClfHeadAndAdapters
    try:
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.config = torch.load(cached_path(os.path.join(model_file, "model_training_args.bin")))
        self.model = TransformerWithClfHeadAndAdapters(self.config["config"],
                                                       self.config["config_ft"]).to(self.device)
        state_dict = torch.load(cached_path(os.path.join(model_file, "model_weights.pth")),
                                map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    except:
        raise Exception("Requires a valid transformer model file ({0}/model_weights.pth) "
                        "and its config file ({0}/model_training_args.bin)."
                        .format(model_file))
Example #3
Source File: classifiers.py From fine-grained-sentiment with MIT License | 6 votes |
def __init__(self, model_path: str = None) -> None:
    super().__init__()
    "Requires the BertTokenizer from pytorch_transformers"  # pip install pytorch_transformers
    import os
    import torch
    from pytorch_transformers import BertTokenizer, cached_path
    from training.transformer_utils.model import TransformerWithClfHeadAndAdapters
    try:
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.config = torch.load(cached_path(os.path.join(model_path, "model_training_args.bin")))
        self.model = TransformerWithClfHeadAndAdapters(self.config["config"],
                                                       self.config["config_ft"]).to(self.device)
        state_dict = torch.load(cached_path(os.path.join(model_path, "model_weights.pth")),
                                map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    except:
        raise Exception("Requires a valid transformer model file ({0}/model_weights.pth) "
                        "and its config file ({0}/model_training_args.bin)."
                        .format(model_path))
Example #4
Source File: hf_bert_bpe.py From attn2d with MIT License | 6 votes |
def __init__(self, args):
    try:
        from pytorch_transformers import BertTokenizer
        from pytorch_transformers.tokenization_utils import clean_up_tokenization
    except ImportError:
        raise ImportError(
            'Please install 1.0.0 version of pytorch_transformers '
            'with: pip install pytorch-transformers'
        )

    if 'bpe_vocab_file' in args:
        self.bert_tokenizer = BertTokenizer(
            args.bpe_vocab_file,
            do_lower_case=not args.bpe_cased
        )
    else:
        vocab_file_name = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased'
        self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name)
    self.clean_up_tokenization = clean_up_tokenization
Example #5
Source File: prosody_dataset.py From prosody with MIT License | 5 votes |
def __init__(self, tagged_sents, tag_to_index, config, word_to_embid=None):
    sents, tags_li, values_li = [], [], []  # lists of lists
    self.config = config
    for sent in tagged_sents:
        words = [word_tag[0] for word_tag in sent]
        tags = [word_tag[1] for word_tag in sent]
        values = [word_tag[3] for word_tag in sent]  # +++HANDE
        if self.config.model != 'LSTM' and self.config.model != 'BiLSTM':
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
            values_li.append(["<pad>"] + values + ["<pad>"])
        else:
            sents.append(words)
            tags_li.append(tags)
            values_li.append(values)
    self.sents, self.tags_li, self.values_li = sents, tags_li, values_li

    if self.config.model == 'BertUncased':
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    else:
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

    self.tag_to_index = tag_to_index
    self.word_to_embid = word_to_embid
Example #6
Source File: text.py From End-to-end-ASR-Pytorch with MIT License | 5 votes |
def load_from_file(cls, vocab_file):
    from pytorch_transformers import BertTokenizer
    return cls(BertTokenizer.from_pretrained(vocab_file))
Example #7
Source File: ssss.py From FewRel with MIT License | 5 votes |
def __init__(self, pretrain_path, max_length):
    nn.Module.__init__(self)
    self.bert = RobertaForSequenceClassification.from_pretrained(pretrain_path, num_labels=2)
    # self.bert = RobertaModel.from_pretrained(pretrain_path)
    self.max_length = max_length
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    self.modelName = 'Roberta'
Example #8
Source File: ssss.py From FewRel with MIT License | 5 votes |
def __init__(self, pretrain_path, max_length):
    nn.Module.__init__(self)
    # self.bert = BertModel.from_pretrained(pretrain_path)
    self.bert = BertForSequenceClassification.from_pretrained(pretrain_path, num_labels=2)
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained(os.path.join(pretrain_path, 'bert_vocab.txt'))
    self.modelName = 'Bert'
Example #9
Source File: bert_vocab.py From OpenNIR with MIT License | 5 votes |
def __init__(self, vocabulary):
    super().__init__(vocabulary)
    layer = vocabulary.config['layer']
    if layer == -1:
        layer = None
    bert_model = bert_models.get_model(vocabulary.config['bert_base'], vocabulary.logger)
    self.bert = CustomBertModelWrapper.from_pretrained(bert_model, depth=layer)
    if vocabulary.config['bert_weights']:
        weight_path = os.path.join(util.path_vocab(vocabulary), vocabulary.config['bert_weights'])
        with vocabulary.logger.duration('loading BERT weights from {}'.format(weight_path)):
            self.bert.load_state_dict(torch.load(weight_path), strict=False)
    self.CLS = vocabulary.tok2id('[CLS]')
    self.SEP = vocabulary.tok2id('[SEP]')
    self.bert.set_trainable(vocabulary.config['train'])
Example #10
Source File: bert_vocab.py From OpenNIR with MIT License | 5 votes |
def __init__(self, config, logger):
    super().__init__(config, logger)
    bert_model = bert_models.get_model(config['bert_base'], self.logger)
    self.tokenizer = BertTokenizer.from_pretrained(bert_model)
    # HACK! Until the transformers library adopts tokenizers, save and re-load vocab
    with tempfile.TemporaryDirectory() as d:
        self.tokenizer.save_vocabulary(d)
        # this tokenizer is ~4x faster than the BertTokenizer, per my measurements
        self.tokenizer = tk.BertWordPieceTokenizer(os.path.join(d, 'vocab.txt'))
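The save-and-reload step is a workaround: save_vocabulary() writes a plain vocab.txt that the Rust-backed wordpiece tokenizer (tk here is presumably the huggingface tokenizers package) can load directly, since at the time the two libraries shared no loading API. One temporary file write buys the roughly 4x tokenization speedup the author measured.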
Example #11
Source File: similarity.py From semantic-text-similarity with MIT License | 5 votes |
def __init__(self, args=None, device='cuda', bert_model_path='bert-base-uncased',
             batch_size=10, learning_rate=5e-5, weight_decay=0, additional_features=None):
    if args is not None:
        self.args = vars(args)

    assert device in ['cuda', 'cpu']

    if not args:
        self.args = {}
        self.args['bert_model_path'] = bert_model_path
        self.args['device'] = device
        self.args['learning_rate'] = learning_rate
        self.args['weight_decay'] = weight_decay
        self.args['batch_size'] = batch_size

    self.log = logging.getLogger()
    self.bert_tokenizer = BertTokenizer.from_pretrained(self.args['bert_model_path'])

    if os.path.exists(self.args['bert_model_path']):
        if os.path.exists(os.path.join(self.args['bert_model_path'], CONFIG_NAME)):
            config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], CONFIG_NAME))
        elif os.path.exists(os.path.join(self.args['bert_model_path'], 'bert_config.json')):
            config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], 'bert_config.json'))
        else:
            raise ValueError("Cannot find a configuration for the BERT model you are attempting to load.")

    self.loss_function = torch.nn.MSELoss()
    config.pretrained_config_archive_map['additional_features'] = additional_features
    self.regressor_net = BertSimilarityRegressor.from_pretrained(self.args['bert_model_path'], config=config)
    self.optimizer = torch.optim.Adam(
        self.regressor_net.parameters(),
        weight_decay=self.args['weight_decay'],
        lr=self.args['learning_rate']
    )
    self.log.info('Initialized BertSentencePairSimilarity model from %s' % self.args['bert_model_path'])
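Stashing additional_features into config.pretrained_config_archive_map looks like a deliberate hack: from_pretrained() only forwards the config object, so piggybacking an extra field on it is one way to get a custom constructor argument through to BertSimilarityRegressor (this reading is an inference from the snippet, not confirmed by the source).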
Example #12
Source File: baseline_models.py From comparatively-finetuning-bert with MIT License | 5 votes |
def __init__(self, pretrained_model_name_for_tokenizer, max_vocabulary_size,
             max_tokenization_length, embedding_dim, num_classes=1,
             num_recurrent_layers=1, use_bidirectional=False,
             hidden_size=128, dropout_rate=0.10, use_gpu=False):
    super(SimpleRNN, self).__init__()
    self.num_recurrent_layers = num_recurrent_layers
    self.use_bidirectional = use_bidirectional
    self.hidden_size = hidden_size
    self.use_gpu = use_gpu

    # Configure tokenizer
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_for_tokenizer)
    self.tokenizer.max_len = max_tokenization_length

    # Define additional layers & utilities specific to the finetuned task
    # Embedding Layer
    self.embedding = nn.Embedding(num_embeddings=max_vocabulary_size,
                                  embedding_dim=embedding_dim)
    # Dropout to prevent overfitting
    self.dropout = nn.Dropout(p=dropout_rate)
    # Recurrent Layer
    self.lstm = nn.LSTM(input_size=embedding_dim,
                        hidden_size=hidden_size,
                        num_layers=num_recurrent_layers,
                        bidirectional=use_bidirectional,
                        batch_first=True)
    # Dense Layer for Classification
    self.clf = nn.Linear(in_features=hidden_size*2 if use_bidirectional else hidden_size,
                         out_features=num_classes)
Example #13
Source File: train_abstractive.py From PreSumm with MIT License | 5 votes |
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #14
Source File: train_abstractive.py From PreSumm with MIT License | 5 votes |
def test_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #15
Source File: train_abstractive.py From PreSumm with MIT License | 5 votes |
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False, device=device)
    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
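The three PreSumm functions above build their BOS/EOS/EOQ symbols from BERT's reserved [unusedN] vocabulary slots, which normal tokenization never produces and which are therefore safe to repurpose. A quick way to inspect the ids they resolve to (a sketch; the printed values are what bert-base-uncased ships with, so verify against your own vocab):

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
for sym in ('[PAD]', '[unused0]', '[unused1]', '[unused2]'):
    print(sym, tokenizer.vocab[sym])  # e.g. [PAD] 0, [unused0] 1, [unused1] 2, [unused2] 3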
Example #16
Source File: bert_preprocessor.py From MatchZoo-py with Apache License 2.0 | 5 votes |
def __init__(self, mode: str = 'bert-base-uncased'):
    """Initialization."""
    super().__init__()
    self._tokenizer = BertTokenizer.from_pretrained(mode)
Example #17
Source File: chinese_preprocess.py From LCF-ATEPC with MIT License | 4 votes |
def read_chinese(path):
    output_path = path + '_output.txt'
    content = path + '_sentence.txt'
    aspect = path + '_target.txt'
    polarity = path + '_label.txt'

    fin = open(content, 'r', encoding='utf-8', newline='\n', errors='ignore')
    reviews = fin.readlines()
    fin.close()
    for i in range(len(reviews)):
        reviews[i] = reviews[i].strip()

    fin = open(aspect, 'r', encoding='utf-8', newline='\n', errors='ignore')
    aspects = fin.readlines()
    fin.close()
    for i in range(len(aspects)):
        aspects[i] = aspects[i].strip()

    fin = open(polarity, 'r', encoding='utf-8', newline='\n', errors='ignore')
    polarities = fin.readlines()
    fin.close()
    for i in range(len(polarities)):
        polarities[i] = polarities[i].strip()

    from pytorch_transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert_for_global_context-base-chinese', do_lower_case=True)

    with open(output_path, 'w', encoding='utf-8', newline='\n', errors='ignore') as f_out:
        print(len(reviews))
        print(len(aspects))
        print(len(polarities))
        for i in range(len(reviews)):
            # original compared strings with "is"; "==" is the correct comparison
            if aspects[i] == '0':
                aspects[i] = reviews[i]
            if aspects[i].replace(' ', '') not in reviews[i]:
                print(aspects[i].replace(' ', ''))
                continue
            reviews[i] = reviews[i].replace(aspects[i].replace(' ', ''), ' $T$ ')
            f_out.write(' '.join(tokenizer.tokenize(reviews[i])) + '\n')
            f_out.write(' '.join(tokenizer.tokenize(aspects[i].replace(' ', ''))) + '\n')
            if polarities[i].strip() == '0':
                f_out.write('1' + '\n')
            else:
                f_out.write('-1' + '\n')
Example #18
Source File: baseline_models.py From comparatively-finetuning-bert with MIT License | 4 votes |
def __init__(self, pretrained_model_name_for_embeddings, max_tokenization_length,
             num_classes=1, num_recurrent_layers=1, use_bidirectional=False,
             hidden_size=128, dropout_rate=0.10, use_gpu=False):
    super(SimpleRNNWithBERTEmbeddings, self).__init__()
    self.num_recurrent_layers = num_recurrent_layers
    self.use_bidirectional = use_bidirectional
    self.hidden_size = hidden_size
    self.use_gpu = use_gpu

    # Configure tokenizer
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_for_embeddings)
    self.tokenizer.max_len = max_tokenization_length

    # Define additional layers & utilities specific to the finetuned task
    # Embedding Layer
    # Get global BERT config
    self.config = BertConfig.from_pretrained(pretrained_model_name_for_embeddings)
    # Extract all parameters (weights and bias matrices) for the 12 layers
    all_states_dict = BertModel.from_pretrained(pretrained_model_name_for_embeddings,
                                                config=self.config).state_dict()
    # Get customized BERT config
    self.config.max_position_embeddings = max_tokenization_length
    self.config.num_hidden_layers = 0
    self.config.output_hidden_states = True
    # Get pretrained BERT model & all its learnable parameters
    self.bert = BertModel.from_pretrained(pretrained_model_name_for_embeddings, config=self.config)
    current_states_dict = self.bert.state_dict()

    # Assign matching parameters (weights and biases of ONLY embeddings)
    for param in current_states_dict.keys():
        if 'embedding' in param:
            current_states_dict[param] = all_states_dict[param]

    # Update parameters in extracted BERT model
    self.bert.load_state_dict(current_states_dict)
    logging.info('Loaded %d learnable parameters from pretrained BERT model with %d layer(s)' %
                 (len(list(self.bert.parameters())), 0))

    # Dropout to prevent overfitting
    self.dropout = nn.Dropout(p=dropout_rate)
    # Recurrent Layer
    self.lstm = nn.LSTM(input_size=self.config.hidden_size,
                        hidden_size=hidden_size,
                        num_layers=num_recurrent_layers,
                        bidirectional=use_bidirectional,
                        batch_first=True)
    # Dense Layer for Classification
    self.clf = nn.Linear(in_features=hidden_size * 2 if self.use_bidirectional else hidden_size,
                         out_features=num_classes)
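The trick worth noting here: setting config.num_hidden_layers = 0 before the second from_pretrained() call instantiates a BERT that is essentially just its embedding stack, and the state-dict copy then restores only the pretrained embedding weights. The result reuses BERT's learned token and position embeddings while leaving all sequence modeling to the LSTM defined below it.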
Example #19
Source File: train_abstractive.py From PreSumm with MIT License | 4 votes |
def train_abs_single(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from, map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s' % args.load_from_extractive)
        bert_from_extractive = torch.load(args.load_from_extractive,
                                          map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device, shuffle=True, is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]
    logger.info(model)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    train_loss = abs_loss(model.generator, symbols, model.vocab_size, device, train=True,
                          label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)
    trainer.train(train_iter_fct, args.train_steps)