Python pytorch_transformers.BertTokenizer.from_pretrained() Examples

The following are 19 code examples of pytorch_transformers.BertTokenizer.from_pretrained(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pytorch_transformers, or try the search function.
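Before working through the project examples below, here is a minimal, self-contained sketch of the call itself (assuming pytorch-transformers 1.x is installed and the 'bert-base-uncased' vocabulary can be downloaded or is already cached):

from pytorch_transformers import BertTokenizer

# Download (or load from the local cache) the WordPiece vocabulary shipped
# with the uncased base BERT model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Split raw text into WordPiece tokens and map them to vocabulary ids.
tokens = tokenizer.tokenize("Hello, how are you?")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)      # ['hello', ',', 'how', 'are', 'you', '?']
print(token_ids)

The same method also accepts a path to a local directory or vocabulary file instead of a shortcut model name, which is how several of the examples below load project-specific vocabularies.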
Example #1
Source File: hf_bert_bpe.py    From fairseq with MIT License
def __init__(self, args):
        try:
            from pytorch_transformers import BertTokenizer
            from pytorch_transformers.tokenization_utils import clean_up_tokenization
        except ImportError:
            raise ImportError(
                'Please install 1.0.0 version of pytorch_transformers '
                'with: pip install pytorch-transformers'
            )

        if 'bpe_vocab_file' in args:
            self.bert_tokenizer = BertTokenizer(
                args.bpe_vocab_file,
                do_lower_case=not args.bpe_cased
            )
        else:
            vocab_file_name = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased'
            self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name)
            self.clean_up_tokenization = clean_up_tokenization 
Example #2
Source File: explainer.py    From fine-grained-sentiment with MIT License
def __init__(self, model_file: str=None) -> None:
        "Requires the BertTokenizer from pytorch_transformers"
        # pip install pytorch_transformers
        import os
        import torch
        from pytorch_transformers import BertTokenizer, cached_path
        from training.transformer_utils.model import TransformerWithClfHeadAndAdapters
        try:
            self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            self.config = torch.load(cached_path(os.path.join(model_file, "model_training_args.bin")))
            self.model = TransformerWithClfHeadAndAdapters(self.config["config"],
                                                           self.config["config_ft"]).to(self.device)
            state_dict = torch.load(cached_path(os.path.join(model_file, "model_weights.pth")),
                                    map_location=self.device)
            self.model.load_state_dict(state_dict)
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
        except Exception:
            raise Exception("Require a valid transformer model file ({0}/model_weights.pth) "
                            "and its config file ({0}/model_training_args.bin)."
                            .format(model_file)) 
Example #3
Source File: classifiers.py    From fine-grained-sentiment with MIT License
def __init__(self, model_path: str=None) -> None:
        super().__init__()
        "Requires the BertTokenizer from pytorch_transformers"
        # pip install pytorch_transformers
        import os
        import torch
        from pytorch_transformers import BertTokenizer, cached_path
        from training.transformer_utils.model import TransformerWithClfHeadAndAdapters
        try:
            self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            self.config = torch.load(cached_path(os.path.join(model_path, "model_training_args.bin")))
            self.model = TransformerWithClfHeadAndAdapters(self.config["config"],
                                                           self.config["config_ft"]).to(self.device)
            state_dict = torch.load(cached_path(os.path.join(model_path, "model_weights.pth")),
                                    map_location=self.device)
            self.model.load_state_dict(state_dict)
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
        except Exception:
            raise Exception("Require a valid transformer model file ({0}/model_weights.pth) "
                            "and its config file ({0}/model_training_args.bin)."
                            .format(model_path)) 
Example #4
Source File: hf_bert_bpe.py    From attn2d with MIT License
def __init__(self, args):
        try:
            from pytorch_transformers import BertTokenizer
            from pytorch_transformers.tokenization_utils import clean_up_tokenization
        except ImportError:
            raise ImportError(
                'Please install 1.0.0 version of pytorch_transformers '
                'with: pip install pytorch-transformers'
            )

        if 'bpe_vocab_file' in args:
            self.bert_tokenizer = BertTokenizer(
                args.bpe_vocab_file,
                do_lower_case=not args.bpe_cased
            )
        else:
            vocab_file_name = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased'
            self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name)
            self.clean_up_tokenization = clean_up_tokenization 
Example #5
Source File: prosody_dataset.py    From prosody with MIT License
def __init__(self, tagged_sents, tag_to_index, config, word_to_embid=None):
        sents, tags_li, values_li = [], [], []  # list of lists
        self.config = config

        for sent in tagged_sents:
            words = [word_tag[0] for word_tag in sent]
            tags = [word_tag[1] for word_tag in sent]
            values = [word_tag[3] for word_tag in sent] #+++HANDE

            if self.config.model != 'LSTM' and self.config.model != 'BiLSTM':
                sents.append(["[CLS]"] + words + ["[SEP]"])
                tags_li.append(["<pad>"] + tags + ["<pad>"])
                values_li.append(["<pad>"] + values + ["<pad>"])
            else:
                sents.append(words)
                tags_li.append(tags)
                values_li.append(values)

        self.sents, self.tags_li, self.values_li = sents, tags_li, values_li
        if self.config.model == 'BertUncased':
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        else:
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

        self.tag_to_index = tag_to_index
        self.word_to_embid = word_to_embid 
Example #6
Source File: text.py    From End-to-end-ASR-Pytorch with MIT License
def load_from_file(cls, vocab_file):
        from pytorch_transformers import BertTokenizer
        return cls(BertTokenizer.from_pretrained(vocab_file)) 
Example #7
Source File: ssss.py    From FewRel with MIT License
def __init__(self, pretrain_path, max_length): 
        nn.Module.__init__(self)
        self.bert = RobertaForSequenceClassification.from_pretrained(pretrain_path, num_labels=2)
        #self.bert = RobertaModel.from_pretrained(pretrain_path)
        self.max_length = max_length
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.modelName = 'Roberta' 
Example #8
Source File: ssss.py    From FewRel with MIT License
def __init__(self, pretrain_path, max_length): 
        nn.Module.__init__(self)
        # self.bert = BertModel.from_pretrained(pretrain_path)
        self.bert = BertForSequenceClassification.from_pretrained(
                pretrain_path,
                num_labels=2)
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained(os.path.join(
            pretrain_path, 'bert_vocab.txt'))
        self.modelName = 'Bert' 
Example #9
Source File: bert_vocab.py    From OpenNIR with MIT License
def __init__(self, vocabulary):
        super().__init__(vocabulary)
        layer = vocabulary.config['layer']
        if layer == -1:
            layer = None
        bert_model = bert_models.get_model(vocabulary.config['bert_base'], vocabulary.logger)
        self.bert = CustomBertModelWrapper.from_pretrained(bert_model, depth=layer)
        if vocabulary.config['bert_weights']:
            weight_path = os.path.join(util.path_vocab(vocabulary), vocabulary.config['bert_weights'])
            with vocabulary.logger.duration('loading BERT weights from {}'.format(weight_path)):
                self.bert.load_state_dict(torch.load(weight_path), strict=False)
        self.CLS = vocabulary.tok2id('[CLS]')
        self.SEP = vocabulary.tok2id('[SEP]')
        self.bert.set_trainable(vocabulary.config['train']) 
Example #10
Source File: bert_vocab.py    From OpenNIR with MIT License
def __init__(self, config, logger):
        super().__init__(config, logger)
        bert_model = bert_models.get_model(config['bert_base'], self.logger)
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
        # HACK! Until the transformers library adopts tokenizers, save and re-load vocab
        with tempfile.TemporaryDirectory() as d:
            self.tokenizer.save_vocabulary(d)
            # this tokenizer is ~4x faster as the BertTokenizer, per my measurements
            self.tokenizer = tk.BertWordPieceTokenizer(os.path.join(d, 'vocab.txt')) 
Example #11
Source File: similarity.py    From semantic-text-similarity with MIT License
def __init__(self, args=None, device='cuda', bert_model_path='bert-base-uncased', batch_size=10, learning_rate=5e-5, weight_decay=0, additional_features=None):
        if args is not None:
            self.args = vars(args)

        assert device in ['cuda', 'cpu']

        if not args:
            self.args = {}
            self.args['bert_model_path'] = bert_model_path
            self.args['device'] = device
            self.args['learning_rate'] = learning_rate
            self.args['weight_decay'] = weight_decay
            self.args['batch_size'] = batch_size

        self.log = logging.getLogger()

        self.bert_tokenizer = BertTokenizer.from_pretrained(self.args['bert_model_path'])
        if os.path.exists(self.args['bert_model_path']):
            if os.path.exists(os.path.join(self.args['bert_model_path'], CONFIG_NAME)):
                config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], CONFIG_NAME))
            elif os.path.exists(os.path.join(self.args['bert_model_path'], 'bert_config.json')):
                config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], 'bert_config.json'))
            else:
                raise ValueError("Cannot find a configuration for the BERT model you are attempting to load.")

        self.loss_function = torch.nn.MSELoss()

        config.pretrained_config_archive_map['additional_features'] = additional_features

        self.regressor_net = BertSimilarityRegressor.from_pretrained(self.args['bert_model_path'], config=config)
        self.optimizer = torch.optim.Adam(
            self.regressor_net.parameters(),
            weight_decay=self.args['weight_decay'],
            lr=self.args['learning_rate']
        )
        self.log.info('Initialized BertSentencePairSimilarity model from %s' % self.args['bert_model_path']) 
Example #12
Source File: baseline_models.py    From comparatively-finetuning-bert with MIT License
def __init__(self, pretrained_model_name_for_tokenizer, max_vocabulary_size,
                 max_tokenization_length, embedding_dim, num_classes=1, num_recurrent_layers=1,
                 use_bidirectional=False, hidden_size=128, dropout_rate=0.10, use_gpu=False):
        super(SimpleRNN, self).__init__()
        self.num_recurrent_layers = num_recurrent_layers
        self.use_bidirectional = use_bidirectional
        self.hidden_size = hidden_size
        self.use_gpu = use_gpu

        # Configure tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_for_tokenizer)
        self.tokenizer.max_len = max_tokenization_length

        # Define additional layers & utilities specific to the finetuned task
        # Embedding Layer
        self.embedding = nn.Embedding(num_embeddings=max_vocabulary_size,
                                      embedding_dim=embedding_dim)

        # Dropout to prevent overfitting
        self.dropout = nn.Dropout(p=dropout_rate)

        # Recurrent Layer
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=num_recurrent_layers,
                            bidirectional=use_bidirectional,
                            batch_first=True)

        # Dense Layer for Classification
        self.clf = nn.Linear(in_features=hidden_size*2 if use_bidirectional else hidden_size,
                             out_features=num_classes) 
Example #13
Source File: train_abstractive.py    From PreSumm with MIT License
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step) 
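The PreSumm functions in Examples #13, #14, #15, and #19 all repurpose reserved entries of the BERT vocabulary as decoder control symbols. Stripped of the surrounding training and inference code, the lookup amounts to this sketch (assuming the standard 'bert-base-uncased' vocabulary):

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# PreSumm maps the unused '[unusedN]' vocabulary slots to BOS/EOS/EOQ markers
# for its abstractive decoder; '[PAD]' keeps its usual padding role.
symbols = {
    'BOS': tokenizer.vocab['[unused0]'],
    'EOS': tokenizer.vocab['[unused1]'],
    'PAD': tokenizer.vocab['[PAD]'],
    'EOQ': tokenizer.vocab['[unused2]'],
}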
Example #14
Source File: train_abstractive.py    From PreSumm with MIT License
def test_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step) 
Example #15
Source File: train_abstractive.py    From PreSumm with MIT License
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent() 
Example #16
Source File: bert_preprocessor.py    From MatchZoo-py with Apache License 2.0
def __init__(self, mode: str = 'bert-base-uncased'):
        """Initialization."""
        super().__init__()
        self._tokenizer = BertTokenizer.from_pretrained(mode) 
Example #17
Source File: chinese_preprocess.py    From LCF-ATEPC with MIT License
def read_chinese(path):
        output_path = path + '_output.txt'
        content = path + '_sentence.txt'
        aspect = path + '_target.txt'
        polarity = path + '_label.txt'
        fin = open(content, 'r', encoding='utf-8', newline='\n', errors='ignore')
        reviews = fin.readlines()
        fin.close()
        for i in range(len(reviews)):
            reviews[i] = reviews[i].strip()

        fin = open(aspect, 'r', encoding='utf-8', newline='\n', errors='ignore')
        aspects = fin.readlines()
        fin.close()
        for i in range(len(aspects)):
            aspects[i] = aspects[i].strip()

        fin = open(polarity, 'r', encoding='utf-8', newline='\n', errors='ignore')
        polarities = fin.readlines()
        fin.close()
        for i in range(len(polarities)):
            polarities[i] = polarities[i].strip()

        from pytorch_transformers import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained('bert_for_global_context-base-chinese', do_lower_case=True)

        with open(output_path, 'w', encoding='utf-8', newline='\n', errors='ignore') as f_out:
            print(len(reviews))
            print(len(aspects))
            print(len(polarities))

            for i in range(len(reviews)):

                if aspects[i] == '0':
                    aspects[i] = reviews[i]
                if aspects[i].replace(' ', '') not in reviews[i]:
                    print(aspects[i].replace(' ', ''))
                    continue

                reviews[i] = reviews[i].replace(aspects[i].replace(' ', ''), ' $T$ ')

                f_out.write(' '.join(tokenizer.tokenize(reviews[i])) + '\n')
                f_out.write(' '.join(tokenizer.tokenize(aspects[i].replace(' ', ''))) + '\n')
                if polarities[i].strip() == '0':
                    f_out.write('1' + '\n')
                else:
                    f_out.write('-1' + '\n')

            f_out.close() 
Example #18
Source File: baseline_models.py    From comparatively-finetuning-bert with MIT License
def __init__(self, pretrained_model_name_for_embeddings, max_tokenization_length,
                 num_classes=1, num_recurrent_layers=1, use_bidirectional=False,
                 hidden_size=128, dropout_rate=0.10, use_gpu=False):
        super(SimpleRNNWithBERTEmbeddings, self).__init__()
        self.num_recurrent_layers = num_recurrent_layers
        self.use_bidirectional = use_bidirectional
        self.hidden_size = hidden_size
        self.use_gpu = use_gpu

        # Configure tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_for_embeddings)
        self.tokenizer.max_len = max_tokenization_length

        # Define additional layers & utilities specific to the finetuned task
        # Embedding Layer
        # Get global BERT config
        self.config = BertConfig.from_pretrained(pretrained_model_name_for_embeddings)
        # Extract all parameters (weights and bias matrices) for the 12 layers
        all_states_dict = BertModel.from_pretrained(pretrained_model_name_for_embeddings,
                                                    config=self.config).state_dict()

        # Get customized BERT config
        self.config.max_position_embeddings = max_tokenization_length
        self.config.num_hidden_layers = 0
        self.config.output_hidden_states = True

        # Get pretrained BERT model & all its learnable parameters
        self.bert = BertModel.from_pretrained(pretrained_model_name_for_embeddings,
                                              config=self.config)
        current_states_dict = self.bert.state_dict()

        # Assign matching parameters (weights and biases of ONLY embeddings)
        for param in current_states_dict.keys():
            if 'embedding' in param:
                current_states_dict[param] = all_states_dict[param]

        # Update parameters in extracted BERT model
        self.bert.load_state_dict(current_states_dict)

        logging.info('Loaded %d learnable parameters from pretrained BERT model with %d layer(s)' %
                     (len(list(self.bert.parameters())), 0))

        # Dropout to prevent overfitting
        self.dropout = nn.Dropout(p=dropout_rate)

        # Recurrent Layer
        self.lstm = nn.LSTM(input_size=self.config.hidden_size,
                            hidden_size=hidden_size,
                            num_layers=num_recurrent_layers,
                            bidirectional=use_bidirectional,
                            batch_first=True)

        # Dense Layer for Classification
        self.clf = nn.Linear(in_features=hidden_size * 2 if self.use_bidirectional else hidden_size,
                             out_features=num_classes) 
Example #19
Source File: train_abstractive.py    From PreSumm with MIT License
def train_abs_single(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if (args.load_from_extractive != ''):
        logger.info('Loading bert from extractive model %s' % args.load_from_extractive)
        bert_from_extractive = torch.load(args.load_from_extractive, map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True), args.batch_size, device,
                                      shuffle=True, is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if (args.sep_optim):
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    logger.info(model)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    train_loss = abs_loss(model.generator, symbols, model.vocab_size, device, train=True,
                          label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)

    trainer.train(train_iter_fct, args.train_steps)