Python pytorch_pretrained_bert.BertTokenizer.from_pretrained() Examples
The following are 30 code examples of pytorch_pretrained_bert.BertTokenizer.from_pretrained(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pytorch_pretrained_bert.BertTokenizer, or try the search function.
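Before working through the examples, here is a minimal, self-contained sketch of the call they all build on. It assumes the pytorch_pretrained_bert package is installed and that the 'bert-base-uncased' vocabulary can be downloaded or is already cached; the sample sentence is illustrative only.

from pytorch_pretrained_bert import BertTokenizer

# Download (or load from the local cache) the WordPiece vocabulary of a pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize a sentence into WordPieces, add the special tokens, and map everything to vocabulary ids.
tokens = ['[CLS]'] + tokenizer.tokenize("Hello, how are you?") + ['[SEP]']
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)  # '[CLS]' maps to 101 and '[SEP]' to 102 in this vocabulary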
Example #1
Source File: utils_unified.py From interpret-text with MIT License | 6 votes |
def _get_single_embedding(model, text, device):
    """Get the bert embedding for a single sentence

    :param text: The current sentence
    :type text: str
    :param device: A pytorch device
    :type device: torch.device
    :param model: a pytorch model
    :type model: torch.nn
    :return: A bert embedding of the single sentence
    :rtype: torch.embedding
    """
    tokenizer = BertTokenizer.from_pretrained(Language.ENGLISH)
    words = [BertTokens.CLS] + tokenizer.tokenize(text) + [BertTokens.SEP]
    tokenized_ids = tokenizer.convert_tokens_to_ids(words)
    token_tensor = torch.tensor([tokenized_ids], device=device)
    embedding = model.bert.embeddings(token_tensor)[0]
    return embedding, words
Example #2
Source File: bert_dictionary.py From ParlAI with MIT License | 6 votes |
def __init__(self, opt):
    super().__init__(opt)
    # initialize from vocab path
    download(opt['datapath'])
    vocab_path = os.path.join(opt['datapath'], 'models', 'bert_models', VOCAB_PATH)
    self.tokenizer = BertTokenizer.from_pretrained(vocab_path)

    self.start_token = '[CLS]'
    self.end_token = '[SEP]'
    self.null_token = '[PAD]'
    self.start_idx = self.tokenizer.convert_tokens_to_ids(['[CLS]'])[0]  # should be 101
    self.end_idx = self.tokenizer.convert_tokens_to_ids(['[SEP]'])[0]  # should be 102
    self.pad_idx = self.tokenizer.convert_tokens_to_ids(['[PAD]'])[0]  # should be 0

    # set tok2ind for special tokens
    self.tok2ind[self.start_token] = self.start_idx
    self.tok2ind[self.end_token] = self.end_idx
    self.tok2ind[self.null_token] = self.pad_idx

    # set ind2tok for special tokens
    self.ind2tok[self.start_idx] = self.start_token
    self.ind2tok[self.end_idx] = self.end_token
    self.ind2tok[self.pad_idx] = self.null_token
Example #3
Source File: bert_dictionary.py From neural_chat with MIT License | 6 votes |
def __init__(self, opt):
    super().__init__(opt)
    # initialize from vocab path
    download(opt['datapath'])
    vocab_path = os.path.join(opt['datapath'], 'models', 'bert_models', VOCAB_PATH)
    self.tokenizer = BertTokenizer.from_pretrained(vocab_path)

    self.start_token = '[CLS]'
    self.end_token = '[SEP]'
    self.null_token = '[PAD]'
    self.start_idx = self.tokenizer.convert_tokens_to_ids(['[CLS]'])[0]  # should be 101
    self.end_idx = self.tokenizer.convert_tokens_to_ids(['[SEP]'])[0]  # should be 102
    self.pad_idx = self.tokenizer.convert_tokens_to_ids(['[PAD]'])[0]  # should be 0

    # set tok2ind for special tokens
    self.tok2ind[self.start_token] = self.start_idx
    self.tok2ind[self.end_token] = self.end_idx
    self.tok2ind[self.null_token] = self.pad_idx

    # set ind2tok for special tokens
    self.ind2tok[self.start_idx] = self.start_token
    self.ind2tok[self.end_idx] = self.end_token
    self.ind2tok[self.pad_idx] = self.null_token
Example #4
Source File: data_loader.py From NER-BERT-pytorch with MIT License | 6 votes |
def __init__(self, data_dir, bert_model_dir, params, token_pad_idx=0):
    self.data_dir = data_dir
    self.batch_size = params.batch_size
    self.max_len = params.max_len
    self.device = params.device
    self.seed = params.seed
    self.token_pad_idx = 0

    tags = self.load_tags()
    self.tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    self.idx2tag = {idx: tag for idx, tag in enumerate(tags)}
    params.tag2idx = self.tag2idx
    params.idx2tag = self.idx2tag
    self.tag_pad_idx = self.tag2idx['O']

    self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir, do_lower_case=True)
Example #5
Source File: qa_sampler.py From semanticRetrievalMRS with MIT License | 6 votes |
def inspect_sampler_squad_examples():
    bert_model_name = "bert-base-uncased"
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    do_lower_case = True
    max_pre_context_length = 315
    max_query_length = 64
    doc_stride = 128
    debug = True

    tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)
    squad_train_v2 = common.load_json(config.SQUAD_TRAIN_2_0)
    train_eitem_list = preprocessing_squad(squad_train_v2)
    train_fitem_dict, train_fitem_list = eitems_to_fitems(train_eitem_list, tokenizer, is_training=False,
                                                          max_tokens_for_doc=max_pre_context_length,
                                                          doc_stride=doc_stride, debug=debug)
    print(len(train_fitem_list))
Example #6
Source File: model.py From transfer-nlp with MIT License | 6 votes |
def __init__(self, embed_dim: int, hidden_dim: int, num_max_positions: int, num_heads: int, num_layers: int,
             dropout: float, causal: bool, initializer_range: float, num_classes: int):
    super().__init__()
    self.initializer_range: float = initializer_range
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    num_embeddings = len(self.tokenizer.vocab)
    self.num_layers = num_layers
    self.transformer = Transformer(embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads,
                                   num_layers, dropout, causal=causal)
    self.lm_head = torch.nn.Linear(embed_dim, num_embeddings, bias=False)
    self.classification_head = torch.nn.Linear(embed_dim, num_classes)
    self.apply(self.init_weights)
    self.tie_weights()
Example #7
Source File: model.py From transfer-nlp with MIT License | 6 votes |
def __init__(self, embed_dim: int, hidden_dim: int, num_embeddings: int, num_max_positions: int,
             num_heads: int, num_layers: int, dropout: float, causal: bool):
    super().__init__()
    self.causal: bool = causal
    self.tokens_embeddings: torch.nn.Embedding = torch.nn.Embedding(num_embeddings, embed_dim)
    self.position_embeddings: torch.nn.Embedding = torch.nn.Embedding(num_max_positions, embed_dim)
    self.dropout: torch.nn.Dropout = torch.nn.Dropout(dropout)

    self.attentions, self.feed_forwards = torch.nn.ModuleList(), torch.nn.ModuleList()
    self.layer_norms_1, self.layer_norms_2 = torch.nn.ModuleList(), torch.nn.ModuleList()
    for _ in range(num_layers):
        self.attentions.append(torch.nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout))
        self.feed_forwards.append(torch.nn.Sequential(torch.nn.Linear(embed_dim, hidden_dim),
                                                      torch.nn.ReLU(),
                                                      torch.nn.Linear(hidden_dim, embed_dim)))
        self.layer_norms_1.append(torch.nn.LayerNorm(embed_dim, eps=1e-12))
        self.layer_norms_2.append(torch.nn.LayerNorm(embed_dim, eps=1e-12))

    self.attn_mask = None
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
Example #8
Source File: model.py From transfer-nlp with MIT License | 6 votes |
def __init__(self, embed_dim: int, hidden_dim: int, num_max_positions: int, num_heads: int, num_layers: int,
             dropout: float, causal: bool, initializer_range: float, num_classes: int):
    super().__init__()
    self.initializer_range: float = initializer_range
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    num_embeddings = len(self.tokenizer.vocab)
    self.num_layers = num_layers
    self.transformer = Transformer(embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads,
                                   num_layers, dropout, causal=causal)
    self.lm_head = torch.nn.Linear(embed_dim, num_embeddings, bias=False)
    self.classification_head = torch.nn.Linear(embed_dim, num_classes)
    self.apply(self.init_weights)
    self.tie_weights()
Example #9
Source File: model.py From transfer-nlp with MIT License | 6 votes |
def __init__(self, embed_dim: int, hidden_dim: int, num_embeddings: int, num_max_positions: int,
             num_heads: int, num_layers: int, dropout: float, causal: bool):
    super().__init__()
    self.causal: bool = causal
    self.tokens_embeddings: torch.nn.Embedding = torch.nn.Embedding(num_embeddings, embed_dim)
    self.position_embeddings: torch.nn.Embedding = torch.nn.Embedding(num_max_positions, embed_dim)
    self.dropout: torch.nn.Dropout = torch.nn.Dropout(dropout)

    self.attentions, self.feed_forwards = torch.nn.ModuleList(), torch.nn.ModuleList()
    self.layer_norms_1, self.layer_norms_2 = torch.nn.ModuleList(), torch.nn.ModuleList()
    for _ in range(num_layers):
        self.attentions.append(torch.nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout))
        self.feed_forwards.append(torch.nn.Sequential(torch.nn.Linear(embed_dim, hidden_dim),
                                                      torch.nn.ReLU(),
                                                      torch.nn.Linear(hidden_dim, embed_dim)))
        self.layer_norms_1.append(torch.nn.LayerNorm(embed_dim, eps=1e-12))
        self.layer_norms_2.append(torch.nn.LayerNorm(embed_dim, eps=1e-12))

    self.attn_mask = None
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
Example #10
Source File: input_embedding.py From dstc8-meta-dialog with MIT License | 5 votes |
def model(self):
    """lazy model loading"""
    with MODEL_DOWNLOAD_LOCK:  # use lock to ensure model isn't downloaded by two processes at once
        if not getattr(self, "_model", None):
            self._model = BertModel.from_pretrained('bert-base-uncased')
            self._model.eval()
            assert self._model.config.hidden_size == self.embed_dim
            if cuda_utils.CUDA_ENABLED and self.use_cuda_if_available:
                self._model.cuda()
        return self._model
Example #11
Source File: input_embedding.py From dstc8-meta-dialog with MIT License | 5 votes |
def tokenizer(self):
    """lazy model loading"""
    with MODEL_DOWNLOAD_LOCK:  # use lock to ensure model isn't downloaded by two processes at once
        if not getattr(self, "_tokenizer", None):
            self._tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            self.pad_token = '[PAD]'
            self.bos_token = '[BOS]'
            self.eos_token = '[EOS]'
            self.unk_token = '[UNK]'
            self.pad_idx = self._tokenizer.vocab[self.pad_token]
            self.unk_idx = self._tokenizer.vocab[self.unk_token]

            # add EOS and BOS tokens to vocab by reusing unused slots
            self._tokenizer.basic_tokenizer.never_split += (self.eos_token, self.bos_token)
            vocab = self._tokenizer.vocab
            oldkey, newkey = '[unused1]', self.bos_token
            vocab = OrderedDict((newkey if k == oldkey else k, v) for k, v in vocab.items())
            oldkey, newkey = '[unused2]', self.eos_token
            vocab = OrderedDict((newkey if k == oldkey else k, v) for k, v in vocab.items())
            self._tokenizer.vocab = vocab
            self._tokenizer.wordpiece_tokenizer.vocab = vocab
            self.bos_idx = vocab[self.bos_token]
            self.eos_idx = vocab[self.eos_token]
            ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in vocab.items()])
            self._tokenizer.ids_to_tokens = ids_to_tokens
            self._tokenizer.wordpiece_tokenizer.ids_to_tokens = ids_to_tokens
        return self._tokenizer
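The property above remaps the '[unused1]' and '[unused2]' vocabulary slots to custom '[BOS]' and '[EOS]' tokens. As a hedged usage sketch (embedder below is a hypothetical instance of the class that owns this property, not code from the repository):

# Accessing the property triggers the lazy load and vocabulary remapping above.
tok = embedder.tokenizer
tokens = [embedder.bos_token] + tok.tokenize("hello there") + [embedder.eos_token]
ids = tok.convert_tokens_to_ids(tokens)  # '[BOS]' and '[EOS]' now resolve to the reused unused-slot ids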
Example #12
Source File: data_builder.py From BertSum with Apache License 2.0 | 5 votes |
def __init__(self, args):
    self.args = args
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    self.sep_vid = self.tokenizer.vocab['[SEP]']
    self.cls_vid = self.tokenizer.vocab['[CLS]']
    self.pad_vid = self.tokenizer.vocab['[PAD]']
Example #13
Source File: bert_pretrained_encoder.py From lale with Apache License 2.0 | 5 votes |
def __init__(self, batch_size=32):
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.max_seq_length = self.tokenizer.max_len
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = BertModel.from_pretrained('bert-base-uncased')
    self.batch_size = batch_size

# def fit(self, X, y):
#     # TODO: Find the right value for max sequence length
#     return BertPretrainedEncoderImpl()
Example #14
Source File: BERT_Model.py From bert-sense with MIT License | 5 votes |
def __init__(self, device_number='cuda:2', use_cuda=True):
    self.device_number = device_number
    self.use_cuda = use_cuda

    self.tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    self.model = BertModel.from_pretrained('bert-large-uncased')
    self.model.eval()

    if use_cuda:
        self.model.to(device_number)
Example #15
Source File: bert_toeic.py From toeicbert with MIT License | 5 votes |
def solve(row, bertmodel='bert-base-uncased'):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    tokenizer = BertTokenizer.from_pretrained(bertmodel)
    model = BertForMaskedLM.from_pretrained(bertmodel).to(device)
    model.eval()

    question = re.sub('\_+', ' [MASK] ', to_clean(row['question']))
    question_tokens = tokenizer.tokenize(question)
    masked_index = question_tokens.index('[MASK]')

    # make segment which is divided with sentence A or B, but we set all '0' as sentence A
    segment_ids = [0] * len(question_tokens)
    segment_tensors = torch.tensor([segment_ids]).to(device)

    # question tokens convert to ids and tensors
    question_ids = tokenizer.convert_tokens_to_ids(question_tokens)
    question_tensors = torch.tensor([question_ids]).to(device)

    candidates = [to_clean(row['1']), to_clean(row['2']), to_clean(row['3']), to_clean(row['4'])]
    predict_tensor = torch.tensor([get_score(model, tokenizer, question_tensors, segment_tensors,
                                             masked_index, candidate)
                                   for candidate in candidates])
    predict_idx = torch.argmax(predict_tensor).item()

    if 'answer' in row:
        show(row['question'], candidates, predict_idx, row['answer'])
    else:
        show(row['question'], candidates, predict_idx, None)
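The helpers to_clean, get_score, and show are defined elsewhere in the toeicbert repository and are not part of this listing. As an illustration only (not the repository's actual implementation), get_score could rank a candidate by the masked-LM probability of its WordPieces at the masked position, roughly like this:

import torch

def get_score(model, tokenizer, question_tensors, segment_tensors, masked_index, candidate):
    # Map the candidate answer to its WordPiece ids.
    candidate_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(candidate))
    with torch.no_grad():
        # BertForMaskedLM returns logits with shape [batch, seq_len, vocab_size].
        predictions = model(question_tensors, segment_tensors)
    log_probs = torch.nn.functional.log_softmax(predictions[0, masked_index], dim=-1)
    # Score the candidate by the average log-probability of its pieces at the [MASK] position.
    return sum(log_probs[idx].item() for idx in candidate_ids) / max(len(candidate_ids), 1)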
Example #16
Source File: qa_sampler.py From semanticRetrievalMRS with MIT License | 5 votes |
def inspect_upstream_eval_v1():
    bert_model_name = "bert-base-uncased"
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    do_lower_case = True
    max_pre_context_length = 315
    max_query_length = 64
    doc_stride = 128
    is_training = True
    debug_mode = True

    d_list = common.load_jsonl(config.OPEN_SQUAD_DEV_GT)
    in_file_name = config.PRO_ROOT / 'saved_models/05-12-08:44:38_mtr_open_qa_p_level_(num_train_epochs:3)/i(2000)|e(2)|squad|top10(0.6909176915799432)|top20(0.7103122043519394)|seed(12)_eval_results.jsonl'
    cur_eval_results_list = common.load_jsonl(in_file_name)

    top_k = 10
    filter_value = 0.1
    match_type = 'string'

    tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                              cache_dir=bert_pretrain_path)
    fitems_dict, read_fitems_list, _ = get_open_qa_item_with_upstream_paragraphs(
        d_list, cur_eval_results_list, is_training,
        tokenizer, max_pre_context_length, max_query_length, doc_stride,
        debug_mode, top_k, filter_value, match_type)
    print(len(read_fitems_list))
    print(len(fitems_dict))
Example #17
Source File: trainer.py From mrqa with Apache License 2.0 | 5 votes |
def __init__(self, args):
    self.args = args
    self.set_random_seed(random_seed=args.random_seed)
    self.tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    if args.debug:
        print("Debugging mode on.")
    self.features_lst = self.get_features(self.args.train_folder, self.args.debug)
Example #18
Source File: bert_model_runtime.py From botbuilder-python with MIT License | 5 votes |
def _load_model(self) -> None:
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not self.no_cuda else "cpu"
    )
    self.n_gpu = torch.cuda.device_count()

    # Load a trained model and vocabulary that you have fine-tuned
    self.model = BertForSequenceClassification.from_pretrained(
        self.model_dir, num_labels=self.num_labels
    )
    self.tokenizer = BertTokenizer.from_pretrained(
        self.model_dir, do_lower_case=self.do_lower_case
    )
    self.model.to(self.device)
Example #19
Source File: prebert.py From ner_with_dependency with GNU General Public License v3.0 | 5 votes |
def load_bert():
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = BertModel.from_pretrained('bert-base-cased')
    model.eval()
    model.to(device)
    return tokenizer, model
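Once loaded, the returned pair can embed a sentence. A minimal follow-up sketch, assuming torch is imported, device is the same module-level torch.device used inside load_bert, and the sample sentence is illustrative:

tokenizer, model = load_bert()
tokens = ['[CLS]'] + tokenizer.tokenize("John lives in New York .") + ['[SEP]']
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).to(device)
with torch.no_grad():
    # In pytorch_pretrained_bert, BertModel returns (encoded_layers, pooled_output).
    encoded_layers, _ = model(input_ids, output_all_encoded_layers=False)
print(encoded_layers.shape)  # [1, sequence_length, hidden_size]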
Example #20
Source File: dataHelper.py From TextClassificationBenchmark with MIT License | 5 votes |
def load_vocab_from_bert(bert_base):
    bert_vocab_dir = os.path.join(bert_base, "vocab.txt")
    alphabet = Alphabet(start_feature_id=0, alphabet_type="bert")

    from pytorch_pretrained_bert import BertTokenizer
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained(bert_vocab_dir)
    for index, word in tokenizer.ids_to_tokens.items():
        alphabet.add(word)
    return alphabet, tokenizer
Example #21
Source File: bert_servant.py From combine-FEVER-NSMN with MIT License | 5 votes |
def __init__(self, bert_type_name='') -> None:
    super().__init__()
    self.bert_type_name = bert_type_name
    self.bert_tokenizer = BertTokenizer.from_pretrained(self.bert_type_name)
    self.bert_model: BertModel = BertModel.from_pretrained(self.bert_type_name)
    self.bert_model.eval()
Example #22
Source File: bert_tokenizer.py From sciwing with MIT License | 5 votes |
def __init__(self, bert_type: str, do_basic_tokenize=True):
    super(TokenizerForBert, self).__init__()
    self.bert_type = bert_type
    self.do_basic_tokenize = do_basic_tokenize
    self.msg_printer = wasabi.Printer()
    self.allowed_bert_types = [
        "bert-base-uncased",
        "bert-large-uncased",
        "bert-base-cased",
        "bert-large-cased",
        "scibert-base-cased",
        "scibert-sci-cased",
        "scibert-base-uncased",
        "scibert-sci-uncased",
    ]
    self.scibert_foldername_mapping = {
        "scibert-base-cased": "scibert_basevocab_cased",
        "scibert-sci-cased": "scibert_scivocab_cased",
        "scibert-base-uncased": "scibert_basevocab_uncased",
        "scibert-sci-uncased": "scibert_scivocab_uncased",
    }
    assert bert_type in self.allowed_bert_types, self.msg_printer.fail(
        f"You passed {bert_type} for attribute bert_type."
        f"The allowed types are {self.allowed_bert_types}"
    )
    self.vocab_type_or_filename = None
    if "scibert" in self.bert_type:
        foldername = self.scibert_foldername_mapping[self.bert_type]
        self.vocab_type_or_filename = os.path.join(
            EMBEDDING_CACHE_DIR, foldername, "vocab.txt"
        )
    else:
        self.vocab_type_or_filename = self.bert_type

    with self.msg_printer.loading("Loading Bert model"):
        self.tokenizer = BertTokenizer.from_pretrained(
            self.vocab_type_or_filename, do_basic_tokenize=do_basic_tokenize
        )
Example #23
Source File: parse_nk.py From self-attentive-parser with MIT License | 5 votes |
def get_bert(bert_model, bert_do_lower_case):
    # Avoid a hard dependency on BERT by only importing it if it's being used
    from pytorch_pretrained_bert import BertTokenizer, BertModel
    if bert_model.endswith('.tar.gz'):
        tokenizer = BertTokenizer.from_pretrained(bert_model.replace('.tar.gz', '-vocab.txt'),
                                                  do_lower_case=bert_do_lower_case)
    else:
        tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=bert_do_lower_case)
    bert = BertModel.from_pretrained(bert_model)
    return tokenizer, bert

# %%
Example #24
Source File: bert.py From transfer-nlp with MIT License | 5 votes |
def bert_model(pretrained_model_name_or_path: str = 'bert-base-uncased', num_labels: int = 4):
    return BertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        num_labels=num_labels)
Example #25
Source File: bert.py From transfer-nlp with MIT License | 5 votes |
def __init__(self, data_file: str, bert_version: str):
    super().__init__(data_file=data_file)
    self.tokenizer = BertTokenizer.from_pretrained(bert_version)
    df = pd.read_csv(data_file)
    self.target_vocab = Vocabulary(add_unk=False)
    self.target_vocab.add_many(set(df.category))
Example #26
Source File: model.py From transfer-nlp with MIT License | 5 votes |
def __init__(self, adapters_dim: int, embed_dim: int, hidden_dim: int, num_max_positions: int, num_heads: int,
             num_layers: int, dropout: float, causal: bool, initializer_range: float, num_classes: int):
    """ Transformer with a classification head and adapters. """
    super().__init__()
    self.initializer_range: float = initializer_range
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    num_embeddings = len(self.tokenizer.vocab)
    self.num_layers = num_layers
    self.transformer: TransformerWithAdapters = TransformerWithAdapters(adapters_dim, embed_dim, hidden_dim,
                                                                        num_embeddings, num_max_positions,
                                                                        num_heads, num_layers, dropout,
                                                                        causal=causal)
    self.classification_head = torch.nn.Linear(embed_dim, num_classes)
    self.apply(self.init_weights)
Example #27
Source File: model.py From transfer-nlp with MIT License | 5 votes |
def __init__(self, embed_dim: int, hidden_dim: int, num_max_positions: int, num_heads: int, num_layers: int,
             dropout: float, causal: bool, initializer_range: float, num_classes: int):
    super().__init__()
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    num_embeddings = len(self.tokenizer.vocab)
    self.initializer_range = initializer_range
    self.transformer = Transformer(embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads,
                                   num_layers, dropout, causal=causal)
    self.classification_head = torch.nn.Linear(embed_dim, num_classes)
    self.apply(self.init_weights)
Example #28
Source File: model.py From transfer-nlp with MIT License | 5 votes |
def __init__(self, embed_dim: int, hidden_dim: int, num_max_positions: int, num_heads: int, num_layers: int,
             dropout: float, causal: bool, initializer_range: float):
    """ Transformer with a language modeling head on top (tied weights) """
    super().__init__()
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    num_embeddings = len(tokenizer.vocab)
    self.initializer_range = initializer_range
    self.transformer = Transformer(embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads,
                                   num_layers, dropout, causal=causal)
    self.lm_head = torch.nn.Linear(embed_dim, num_embeddings, bias=False)
    self.apply(self.init_weights)
    self.tie_weights()
Example #29
Source File: model.py From transfer-nlp with MIT License | 5 votes |
def __init__(self, adapters_dim: int, embed_dim: int, hidden_dim: int, num_max_positions: int, num_heads: int,
             num_layers: int, dropout: float, causal: bool, initializer_range: float, num_classes: int):
    """ Transformer with a classification head and adapters. """
    super().__init__()
    self.initializer_range: float = initializer_range
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    num_embeddings = len(self.tokenizer.vocab)
    self.num_layers = num_layers
    self.transformer: TransformerWithAdapters = TransformerWithAdapters(adapters_dim, embed_dim, hidden_dim,
                                                                        num_embeddings, num_max_positions,
                                                                        num_heads, num_layers, dropout,
                                                                        causal=causal)
    self.classification_head = torch.nn.Linear(embed_dim, num_classes)
    self.apply(self.init_weights)
Example #30
Source File: model.py From transfer-nlp with MIT License | 5 votes |
def __init__(self, embed_dim: int, hidden_dim: int, num_max_positions: int, num_heads: int, num_layers: int,
             dropout: float, causal: bool, initializer_range: float, num_classes: int):
    super().__init__()
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    num_embeddings = len(self.tokenizer.vocab)
    self.initializer_range = initializer_range
    self.transformer = Transformer(embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads,
                                   num_layers, dropout, causal=causal)
    self.classification_head = torch.nn.Linear(embed_dim, num_classes)
    self.apply(self.init_weights)