Python pytorch_pretrained_bert.tokenization.BertTokenizer.from_pretrained() Examples

The following are 30 code examples of pytorch_pretrained_bert.tokenization.BertTokenizer.from_pretrained(), collected from open-source projects. Each example notes its source file, project, and license. You may also want to check out the other functions and classes available in the pytorch_pretrained_bert.tokenization module.
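In pytorch_pretrained_bert, from_pretrained() accepts either a shortcut model name (such as 'bert-base-uncased') or a path to a vocabulary file or directory, plus optional keyword arguments like do_lower_case and cache_dir; both call styles appear in the examples below. A minimal usage sketch (the cache path here is illustrative):

from pytorch_pretrained_bert.tokenization import BertTokenizer

# Load by shortcut name; the vocabulary is downloaded and cached on first use.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True,
                                          cache_dir='/tmp/bert_cache')

tokens = tokenizer.tokenize("Hello, BERT!")        # WordPiece tokens
ids = tokenizer.convert_tokens_to_ids(tokens)      # vocabulary ids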
Example #1
Source File: gobbli_batcher.py    From gobbli with Apache License 2.0
def __init__(self, path, batch_size=32, gpu=True, labels=None,
                 has_labels=True, is_train=True, dropout_w=0.005, maxlen=128):
        self.batch_size = batch_size
        self.has_labels = has_labels
        self.gpu = gpu
        self.labels = labels
        self.is_train = is_train
        # Explicit cache dir required for some reason -- default doesn't exist in the docker
        # container, maybe?
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='/tmp')
        self.data = self.load(path, maxlen, has_labels)
        if self.is_train:
            indices = list(range(len(self.data)))
            random.shuffle(indices)
            self.data = [self.data[i] for i in indices]  # reassign so the shuffle actually takes effect
        self.data = GobbliBatchGen.make_batches(self.data, batch_size)
        self.offset = 0
        self.dropout_w = dropout_w 
Example #2
Source File: preprocess_embedding.py    From curriculum with GNU General Public License v3.0
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    args.use_gpu = use_gpu

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    bert_model = BertModel.from_pretrained("bert-base-chinese")

    if use_gpu:
        bert_model = bert_model.cuda()

    processor = Preprocess(args, tokenizer, bert_model)
    processor.do_preprocess() 
Example #3
Source File: data_loader.py    From conv-emotion with MIT License
def __init__(self, conversations, labels, conversation_length, sentence_length, data=None):

        # [total_data_size, max_conversation_length, max_sentence_length]
        # tokenized raw text of sentences
        self.conversations = conversations
        self.labels = labels

        # length of each conversation
        # [total_data_size]
        self.conversation_length = conversation_length

        # list of sentence lengths per conversation
        # [total_data_size, max_conversation_length]
        self.sentence_length = sentence_length
        self.data = data
        self.len = len(conversations)

        # Prepare for BERT
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
        self.prepare_BERT() 
Example #4
Source File: bert_pretrained.py    From udify with MIT License
def __init__(self, pretrained_model: str,
                 requires_grad: bool = False,
                 dropout: float = 0.1,
                 layer_dropout: float = 0.1,
                 combine_layers: str = "mix") -> None:
        model = BertModel.from_pretrained(pretrained_model)

        for param in model.parameters():
            param.requires_grad = requires_grad

        super().__init__(bert_model=model,
                         layer_dropout=layer_dropout,
                         combine_layers=combine_layers)

        self.model = model
        self.dropout = dropout
        self.set_dropout(dropout) 
Example #5
Source File: predictor.py    From ConvLab with MIT License
def __init__(self, archive_file, model_file=None, use_cuda=False):
        if not os.path.isfile(archive_file):
            if not model_file:
                raise Exception("No model for DA-predictor is specified!")
            archive_file = cached_path(model_file)
        model_dir = os.path.dirname(os.path.abspath(__file__))
        if not os.path.exists(os.path.join(model_dir, 'checkpoints')):
            archive = zipfile.ZipFile(archive_file, 'r')
            archive.extractall(model_dir)
        
        load_dir = os.path.join(model_dir, "checkpoints/predictor/save_step_15120")
        if not os.path.exists(load_dir):
            archive = zipfile.ZipFile(f'{load_dir}.zip', 'r')
            archive.extractall(os.path.dirname(load_dir))
        
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=False)
        self.max_seq_length = 256
        self.domain = 'restaurant'
        self.model = BertForSequenceClassification.from_pretrained(load_dir, 
            cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(-1)), num_labels=44)
        self.device = 'cuda' if use_cuda else 'cpu'
        self.model.to(self.device) 
Example #6
Source File: utils_bert.py    From interpret-text with MIT License
def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir="."):
        """Initializes the classifier and the underlying pretrained model.

        Args:
            language (Language, optional): The pretrained model's language.
                                           Defaults to Language.ENGLISH.
            num_labels (int, optional): The number of unique labels in the
                training data. Defaults to 2.
            cache_dir (str, optional): Location of BERT's cache directory.
                Defaults to ".".
        """
        if num_labels < 2:
            raise ValueError("Number of labels should be at least 2.")

        self.language = language
        self.num_labels = num_labels
        self.cache_dir = cache_dir

        # create classifier
        self.model = BertForSequenceClassification.from_pretrained(
            language, cache_dir=cache_dir, num_labels=num_labels
        )
        self.has_cuda = self.cuda 
Example #7
Source File: run_cail.py    From cail2019 with Apache License 2.0
def _test(args, device, n_gpu):
    model = CailModel.from_pretrained(args.output_dir)
    tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

    test_dataloader, test_examples, test_features = load_test_features(args, tokenizer)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    model.eval()
    logger.info("Start evaluating")
    all_results = []
    for input_ids, input_mask, segment_ids, example_indices in test_dataloader:
        if len(all_results) % 5000 == 0:
            logger.info("Processing example: %d" % (len(all_results)))
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits, \
            batch_unk_logits, batch_yes_logits, batch_no_logits = model(input_ids, segment_ids, input_mask)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            unk_logits = batch_unk_logits[i].detach().cpu().tolist()
            yes_logits = batch_yes_logits[i].detach().cpu().tolist()
            no_logits = batch_no_logits[i].detach().cpu().tolist()
            test_feature = test_features[example_index.item()]
            unique_id = int(test_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits,
                                         unk_logits=unk_logits,
                                         yes_logits=yes_logits,
                                         no_logits=no_logits))
    output_prediction_file = os.path.join(args.output_dir, "predictions_test.json")
    write_predictions_test(test_examples, test_features, all_results, args.n_best_size,
                           args.max_answer_length, args.do_lower_case, output_prediction_file,
                           args.verbose_logging, args.version_2_with_negative,
                           args.null_score_diff_threshold)
Example #8
Source File: extractor.py    From mt-dnn with MIT License
def process_data(args):
    tokenizer = BertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)
    path = args.finput
    data, is_single_sentence = load_data(path)
    if is_single_sentence:
        tokened_data = build_data_single(
            data, max_seq_len=args.max_seq_length, tokenizer=tokenizer)
    else:
        tokened_data = build_data(
            data,
            max_seq_len=args.max_seq_length,
            tokenizer=tokenizer)
    return tokened_data, is_single_sentence 
Example #9
Source File: preprocess.py    From curriculum with GNU General Public License v3.0
def main():

    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

    processor = Preprocess(args, tokenizer)
    processor.do_preprocess()

    # with open("/Users/limingwei/Desktop/MG1833039.txt", "r") as f:
    #     lines = f.readlines()
    # label = [0, 0]
    # for line in lines:
    #     l = line.strip().split("\t")
    #     label[int(l[1])] += 1
    # print(label) 
Example #10
Source File: sumbt.py    From tatk with Apache License 2.0
def initialize_slot_value_lookup(self, label_ids, slot_ids):

        self.sv_encoder.eval()

        # Slot encoding
        slot_type_ids = torch.zeros(slot_ids.size(), dtype=torch.long).to(self.device)
        slot_mask = slot_ids > 0
        hid_slot, _ = self.sv_encoder(slot_ids.view(-1, self.max_label_length),
                                      slot_type_ids.view(-1, self.max_label_length),
                                      slot_mask.view(-1, self.max_label_length),
                                      output_all_encoded_layers=False)
        hid_slot = hid_slot[:, 0, :]
        hid_slot = hid_slot.detach()
        self.slot_lookup = nn.Embedding.from_pretrained(hid_slot, freeze=True)

        for s, label_id in enumerate(label_ids):
            label_type_ids = torch.zeros(label_id.size(), dtype=torch.long).to(self.device)
            label_mask = label_id > 0
            hid_label, _ = self.sv_encoder(label_id.view(-1, self.max_label_length),
                                           label_type_ids.view(-1, self.max_label_length),
                                           label_mask.view(-1, self.max_label_length),
                                           output_all_encoded_layers=False)
            hid_label = hid_label[:, 0, :]
            hid_label = hid_label.detach()
            self.value_lookup[s] = nn.Embedding.from_pretrained(hid_label, freeze=True)
            self.value_lookup[s].padding_idx = -1

        print("Complete initialization of slot and value lookup") 
Example #11
Source File: sumbt.py    From tatk with Apache License 2.0
def __init__(self):
        self.belief_tracker = BeliefTracker()
        self.batch = None  # generated with dataloader
        self.current_turn = 0
        self.idx2slot = {}
        self.idx2value = {}  # slot value for each slot, use processor.get_labels()

        n_gpu = 0  # defined up front so the log line below also works when DEVICE != 'cuda'
        if DEVICE == 'cuda':
            if not torch.cuda.is_available():
                raise ValueError('cuda not available')
            n_gpu = torch.cuda.device_count()
            if n_gpu < N_GPU:
                raise ValueError('gpu not enough')

        print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            DEVICE, n_gpu, bool(N_GPU > 1), FP16))

        # Get Processor
        self.processor = Processor()
        self.label_list = self.processor.get_labels()
        self.num_labels = [len(labels) for labels in self.label_list]  # number of slot-values in each slot-type
        self.belief_tracker.init_session(self.num_labels)
        if N_GPU > 1:
            self.belief_tracker = torch.nn.DataParallel(self.belief_tracker)

        # tokenizer
        vocab_dir = os.path.join(BERT_DIR, 'vocab.txt')
        if not os.path.exists(vocab_dir):
            raise ValueError("Can't find %s " % vocab_dir)
        self.tokenizer = BertTokenizer.from_pretrained(vocab_dir, do_lower_case=DO_LOWER_CASE)

        self.num_train_steps = None
        self.accumulation = False 
Example #12
Source File: iterator.py    From mrqa with Apache License 2.0
def save_features(args):
    """
    Saving preprocessed features as pickle file
    """
    filename, args = args
    print(filename)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    data_name = filename.split(".")[0]
    pickled_folder = args.pickled_folder + "_{}_{}".format(args.bert_model, str(args.skip_no_ans))
    pickle_file_name = data_name + '.pkl'
    pickle_file_path = os.path.join(pickled_folder, pickle_file_name)

    if os.path.exists(pickle_file_path):  # Check whether pkl file already exists
        return
    else:
        print("[Processing {} file...]".format(data_name))
        file_path = os.path.join(args.train_folder, filename)

        train_examples = read_squad_examples(file_path, debug=args.debug)
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            max_query_length=args.max_query_length,
            doc_stride=args.doc_stride,
            is_training=True,
            skip_no_ans=args.skip_no_ans
        )

        # Save feature lst as pickle (For reuse & fast loading)
        with open(pickle_file_path, 'wb') as pkl_f:
            print("Saving {} file from pkl file...".format(data_name))
            pickle.dump(train_features, pkl_f)

        print("[{} saving done]".format(filename)) 
Example #13
Source File: iterator.py    From mrqa with Apache License 2.0
def iter_test(file_name):
    """
    This is just test code!
    """
    config = Config()
    tokenizer = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=config.do_lower_case)
    train_examples = read_squad_examples('./data/train/{}.jsonl.gz'.format(file_name))

    """
    ***** Running training *****
        Num orig examples = 34287
        Num split examples = 35111 (train_features)
    the numbers of examples and features differ, but the difficulty function is based on examples
    """

    train_features = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=config.max_seq_length,
        max_query_length=config.max_query_length,
        doc_stride=config.doc_stride,
        is_training=True)

    for i in range(10):
        print(train_features[i].input_ids)

    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(train_examples))
    logger.info("  Num split examples = %d", len(train_features)) 
Example #14
Source File: model.py    From MAX-Toxic-Comment-Classifier with Apache License 2.0
def __init__(self, path=DEFAULT_MODEL_PATH):
        """Instantiate the BERT model."""
        logger.info('Loading model from: {}...'.format(path))

        # Load the model
        # 1. set the appropriate parameters
        self.eval_batch_size = 64
        self.max_seq_length = 256
        self.do_lower_case = True

        # 2. Initialize the PyTorch model
        # use the `path` argument consistently instead of hard-coding DEFAULT_MODEL_PATH
        model_state_dict = torch.load(os.path.join(path, 'pytorch_model.bin'), map_location='cpu')
        self.tokenizer = BertTokenizer.from_pretrained(path, do_lower_case=self.do_lower_case)
        self.model = BertForMultiLabelSequenceClassification.from_pretrained(path,
                                                                             num_labels=len(LABEL_LIST),
                                                                             state_dict=model_state_dict)
        self.device = torch.device("cpu")
        self.model.to(self.device)

        # 3. Set the layers to evaluation mode
        self.model.eval()

        logger.info('Loaded model') 
Example #15
Source File: tokenlizer.py    From cudaBERT with Apache License 2.0
def init_tokenlizer(vocab_file, do_lower_case):
    global tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        vocab_file, do_lower_case=do_lower_case)
Example #16
Source File: bert_tokenizer_and_candidate_generator.py    From kb with Apache License 2.0
def __init__(self,
                 entity_candidate_generators: Dict[str, MentionGenerator],
                 entity_indexers: Dict[str, TokenIndexer],
                 bert_model_type: str,
                 do_lower_case: bool,
                 whitespace_tokenize: bool = True,
                 max_word_piece_sequence_length: int = 512) -> None:
        """
        Note: the fields need to be used with a pre-generated allennlp vocabulary
        that contains the entity id namespaces and the bert name space.
        entity_indexers = {'wordnet': indexer for wordnet entities,
                          'wiki': indexer for wiki entities}
        """
        # load BertTokenizer from huggingface
        self.candidate_generators = entity_candidate_generators
        self.bert_tokenizer = BertTokenizer.from_pretrained(
            bert_model_type, do_lower_case=do_lower_case
        )
        self.bert_word_tokenizer = BasicTokenizer(do_lower_case=False)
        # Target length should include start and end token
        self.max_word_piece_sequence_length = max_word_piece_sequence_length

        self._entity_indexers = entity_indexers
        # for bert, we'll give an empty token indexer with empty name space
        # and do the indexing directly with the bert vocab to bypass
        # indexing in the indexer
        self._bert_single_id_indexer = {'tokens': SingleIdTokenIndexer('__bert__')}
        self.do_lowercase = do_lower_case
        self.whitespace_tokenize = whitespace_tokenize
        self.dtype = np.float32 
Example #17
Source File: run_cail.py    From cail2019 with Apache License 2.0
def save_model(args, model, tokenizer):
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)
    logger.info("save model")
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(args.output_dir) 
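Because the weights, config, and vocabulary are written under the predefined names, the output directory can later be passed straight back to from_pretrained; the _test() function in Example #7 above does exactly that:

# Reload everything that save_model() wrote to args.output_dir.
model = CailModel.from_pretrained(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)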
Example #18
Source File: approx.py    From interpret_bert with GNU General Public License v3.0
def load_bert(args):
  # load bert tokenizer and model
  tokenizer = BertTokenizer.from_pretrained(args.bert_model, 
              do_lower_case=True,
              cache_dir=args.cache_dir)
  pretrained_model = BertModel.from_pretrained(args.bert_model, 
              cache_dir=args.cache_dir)
  return tokenizer, pretrained_model

# role scheme generator 
Example #19
Source File: cogqa.py    From CogQA with MIT License
def main(BERT_MODEL='bert-base-uncased', model_file='./models/bert-base-uncased.bin', data_file='./hotpot_dev_distractor_v1.json', max_new_nodes=5):
    setting = 'distractor' if data_file.find('distractor') >= 0 else 'fullwiki'
    with open(data_file, 'r') as fin:
        dataset = json.load(fin)
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
    device = torch.device('cpu') if not torch.cuda.is_available() else torch.device('cuda')
    print('Loading model from {}'.format(model_file))
    model_state_dict = torch.load(model_file)
    model1 = BertForMultiHopQuestionAnswering.from_pretrained(BERT_MODEL, state_dict=model_state_dict['params1'])
    model2 = CognitiveGNN(model1.config.hidden_size)
    model2.load_state_dict(model_state_dict['params2'])
    sp, answer, graphs = {}, {}, {}
    print('Start Evaluating... on {} GPUs'.format(torch.cuda.device_count()))
    model1 = torch.nn.DataParallel(model1, device_ids = range(torch.cuda.device_count()))
    model1.to(device).eval()
    model2.to(device).eval()

    with torch.no_grad():
        for data in tqdm(dataset):
            gold, ans, graph_ret, ans_nodes = cognitive_graph_propagate(tokenizer, data, model1, model2, device, setting = setting, max_new_nodes=max_new_nodes)
            sp[data['_id']] = list(gold)
            answer[data['_id']] = ans
            graphs[data['_id']] = graph_ret + ['answer_nodes: ' + ', '.join(ans_nodes)]
    pred_file = data_file.replace('.json', '_pred.json')
    with open(pred_file, 'w') as fout:
        json.dump({'answer': answer, 'sp': sp, 'graphs': graphs}, fout) 
Example #20
Source File: train.py    From CogQA with MIT License
def main(output_model_file = './models/bert-base-uncased.bin', load = False, mode = 'tensors', batch_size = 12, 
            num_epoch = 1, gradient_accumulation_steps = 1, lr1 = 1e-4, lr2 = 1e-4, alpha = 0.2):
    
    BERT_MODEL = 'bert-base-uncased' # bert-large is too large for ordinary GPU on task #2
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
    with open('./hotpot_train_v1.1_refined.json' ,'r') as fin:
        dataset = json.load(fin)
    bundles = []
    for data in tqdm(dataset):
        try:
            bundles.append(convert_question_to_samples_bundle(tokenizer, data))
        except ValueError as err:
            pass
        # except Exception as err:
        #     traceback.print_exc()
        #     pass
    device = torch.device('cpu') if not torch.cuda.is_available() else torch.device('cuda')
    if load:
        print('Loading model from {}'.format(output_model_file))
        model_state_dict = torch.load(output_model_file)
        model1 = BertForMultiHopQuestionAnswering.from_pretrained(BERT_MODEL, state_dict=model_state_dict['params1'])
        model2 = CognitiveGNN(model1.config.hidden_size)
        model2.load_state_dict(model_state_dict['params2'])

    else:
        model1 = BertForMultiHopQuestionAnswering.from_pretrained(BERT_MODEL,
                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1))
        model2 = CognitiveGNN(model1.config.hidden_size)

    print('Start Training... on {} GPUs'.format(torch.cuda.device_count()))
    model1 = torch.nn.DataParallel(model1, device_ids = range(torch.cuda.device_count()))
    model1, model2 = train(bundles, model1=model1, device=device, mode=mode, model2=model2, # Then pass hyperparams
        batch_size=batch_size, num_epoch=num_epoch, gradient_accumulation_steps=gradient_accumulation_steps,lr1=lr1, lr2=lr2, alpha=alpha)
    
    print('Saving model to {}'.format(output_model_file))
    saved_dict = {'params1' : model1.module.state_dict()}
    saved_dict['params2'] = model2.state_dict()
    torch.save(saved_dict, output_model_file) 
Example #21
Source File: run_asc.py    From BERT-for-RRC-ABSA with Apache License 2.0
def test(args):  # Load a trained model that you have fine-tuned (evaluation runs on the GPU here)
    processor = data_utils.AscProcessor()
    label_list = processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    eval_examples = processor.get_test_examples(args.data_dir)
    eval_features = data_utils.convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, "asc")

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask, all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model = torch.load(os.path.join(args.output_dir, "model.pt"))
    model.cuda()
    model.eval()

    full_logits = []
    full_label_ids = []
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.cuda() for t in batch)
        input_ids, segment_ids, input_mask, label_ids = batch
        
        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.cpu().numpy()

        full_logits.extend(logits.tolist())
        full_label_ids.extend(label_ids.tolist())

    output_eval_json = os.path.join(args.output_dir, "predictions.json") 
    with open(output_eval_json, "w") as fw:
        json.dump({"logits": full_logits, "label_ids": full_label_ids}, fw) 
Example #22
Source File: extract_features.py    From interpret_bert with GNU General Public License v3.0
def load(args):
  print('loading %s model'%args.bert_model)
  device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
  tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True, cache_dir=args.cache_dir)
  model = BertModel.from_pretrained(args.bert_model, cache_dir=args.cache_dir)
  model.to(device)
  if args.num_gpus > 1:
    model = torch.nn.DataParallel(model)
  model.eval()
  return model, tokenizer, device 
Example #23
Source File: batcher.py    From mt-dnn with MIT License
def load(path, is_train=True, maxlen=512, factor=1.0, task_def=None, bert_model='bert-base-uncased', do_lower_case=True):
        task_type = task_def.task_type
        assert task_type is not None

        if task_type == TaskType.MaskLM:
            def load_mlm_data(path):
                from pytorch_pretrained_bert.tokenization import BertTokenizer
                tokenizer = BertTokenizer.from_pretrained(bert_model,
                                                          do_lower_case=do_lower_case)
                vocab_words = list(tokenizer.vocab.keys())
                data = load_loose_json(path)
                docs = []
                for doc in data:
                    paras = doc['text'].split('\n\n')
                    paras = [para.strip() for para in paras if len(para.strip()) > 0]
                    tokens = [tokenizer.tokenize(para) for para in paras]
                    docs.append(tokens)
                return docs, tokenizer
            return load_mlm_data(path)

        with open(path, 'r', encoding='utf-8') as reader:
            data = []
            cnt = 0
            for line in reader:
                sample = json.loads(line)
                sample['factor'] = factor
                cnt += 1
                if is_train:
                    task_obj = tasks.get_task_obj(task_def)
                    if task_obj is not None and not task_obj.input_is_valid_sample(sample, maxlen):
                        continue
                    if (task_type == TaskType.Ranking) and (len(sample['token_id'][0]) > maxlen or len(sample['token_id'][1]) > maxlen):
                        continue
                    if (task_type != TaskType.Ranking) and (len(sample['token_id']) > maxlen):
                        continue
                data.append(sample)
            print('Loaded {} samples out of {}'.format(len(data), cnt))
        return data, None 
Example #24
Source File: extract_features.py    From interpret_bert with GNU General Public License v3.0
def load(args):
  print('loading %s model'%args.bert_model)
  device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
  tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True, cache_dir=args.cache_dir)
  model = BertModel.from_pretrained(args.bert_model, cache_dir=args.cache_dir)
  model.to(device)
  if args.num_gpus > 1:
    model = torch.nn.DataParallel(model)
  if args.untrained_bert:
    model.apply(init_weights)
  model.eval()
  return model, tokenizer, device 
Example #25
Source File: extract_features.py    From interpret_bert with GNU General Public License v3.0
def load(args):
  print('loading %s model'%args.bert_model)
  device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
  tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True, cache_dir=args.cache_dir)
  model = BertModel.from_pretrained(args.bert_model, cache_dir=args.cache_dir)
  model.to(device)
  if args.num_gpus > 1:
    model = torch.nn.DataParallel(model)
  model.eval()
  return model, tokenizer, device 
Example #26
Source File: induce_dep_trees.py    From interpret_bert with GNU General Public License v3.0
def load(args):
  print('loading %s model'%args.bert_model)
  device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
  tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True, cache_dir=args.cache_dir)
  model = BertModel.from_pretrained(args.bert_model, cache_dir=args.cache_dir)
  model.to(device)
  if args.num_gpus > 1:
    model = torch.nn.DataParallel(model)
  model.eval()
  return model, tokenizer, device 
Example #27
Source File: common.py    From nlp-recipes with MIT License
def __init__(self, language=Language.ENGLISH, to_lower=False, cache_dir="."):
        """Initializes the underlying pretrained BERT tokenizer.

        Args:
            language (Language, optional): The pretrained model's language.
                                           Defaults to Language.ENGLISH.
            to_lower (bool, optional): Whether to lowercase the input during
                tokenization. Defaults to False.
            cache_dir (str, optional): Location of BERT's cache directory.
                Defaults to ".".
        """
        self.tokenizer = BertTokenizer.from_pretrained(
            language, do_lower_case=to_lower, cache_dir=cache_dir
        )
        self.language = language 
Example #28
Source File: hubconf.py    From squash-generation with MIT License
def bertModel(*args, **kwargs):
    """
    BertModel is the basic BERT Transformer model with a layer of summed token,
    position and sequence embeddings followed by a series of identical
    self-attention blocks (12 for BERT-base, 24 for BERT-large).
    """
    model = BertModel.from_pretrained(*args, **kwargs)
    return model 
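Since this function is defined in a hubconf.py, it is typically invoked through torch.hub rather than imported directly. A sketch, where the repository string is a hypothetical placeholder for the project's actual GitHub path:

import torch

# 'owner/squash-generation' is a placeholder repo path, not the verified one.
model = torch.hub.load('owner/squash-generation', 'bertModel', 'bert-base-uncased')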
Example #29
Source File: utils_bert.py    From interpret-text with MIT License
def __init__(self, language=Language.ENGLISH, to_lower=False, cache_dir="."):
        """Initializes the underlying pretrained BERT tokenizer.

        Args:
            language (Language, optional): The pretrained model's language.
                                           Defaults to Language.ENGLISH.
            to_lower (bool, optional): Whether to lowercase the input during
                tokenization. Defaults to False.
            cache_dir (str, optional): Location of BERT's cache directory.
                Defaults to ".".
        """
        self.tokenizer = BertTokenizer.from_pretrained(
            language, do_lower_case=to_lower, cache_dir=cache_dir
        )
        self.language = language 
Example #30
Source File: run_bert_tagger.py    From mrc-for-flat-nested-ner with Apache License 2.0
def load_model(config, num_train_steps, label_list):
    # device = torch.device(torch.cuda.is_available())
    device = torch.device("cuda") 
    n_gpu = torch.cuda.device_count()
    model = BertTagger(config, num_labels=len(label_list)) 
    # model = BertForTagger.from_pretrained(config.bert_model, num_labels=13)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # prepare optimizer
    param_optimizer = list(model.named_parameters())

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}]

    # optimizer = Adam(optimizer_grouped_parameters, lr=config.learning_rate) 
    optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=config.warmup_proportion, t_total=num_train_steps, max_grad_norm=config.clip_grad) 

    return model, optimizer, device, n_gpu