Python transformers.BertTokenizer.from_pretrained() Examples
The following are 30 code examples of transformers.BertTokenizer.from_pretrained(), collected from open-source projects; a few of them exercise the same from_pretrained() pattern through related tokenizer classes such as XLNetTokenizer, RobertaTokenizer, GPT2Tokenizer, and AutoTokenizer. The source file, project, and license are noted above each example. You may also want to check out all other available functions and classes of transformers.BertTokenizer.
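Before the project-specific examples, here is a minimal, self-contained sketch of the basic call pattern. It is illustrative only: the model name 'bert-base-uncased' and the sample sentence are arbitrary choices, not taken from any particular example below.

from transformers import BertTokenizer

# Load the pretrained WordPiece vocabulary (downloaded on first use, then cached locally).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = "Hello, my dog is cute"

# Split the text into WordPiece tokens.
tokens = tokenizer.tokenize(text)

# Map the text to vocabulary ids, adding the [CLS] and [SEP] special tokens.
input_ids = tokenizer.encode(text, add_special_tokens=True)

print(tokens)
print(input_ids)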
Example #1
Source File: test_transformers.py From keras-onnx with MIT License
def test_TFXLNet(self):
    if enable_full_transformer_test:
        from transformers import XLNetConfig, TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \
            TFXLNetForTokenClassification, TFXLNetForQuestionAnsweringSimple, XLNetTokenizer
        model_list = [TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification,
                      TFXLNetForTokenClassification, TFXLNetForQuestionAnsweringSimple]
    else:
        from transformers import XLNetConfig, TFXLNetModel, XLNetTokenizer
        model_list = [TFXLNetModel]

    # XLNetTokenizer need SentencePiece, so the pickle file does not work here.
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
    config = XLNetConfig(n_layer=2)
    # The model with input mask has MatrixDiagV3 which is not a registered function/op
    token = np.asarray(tokenizer.encode(self.text_str, add_special_tokens=True), dtype=np.int32)
    inputs_onnx = {'input_1': np.expand_dims(token, axis=0)}
    inputs = tf.constant(token)[None, :]  # Batch size 1

    for model_instance_ in model_list:
        keras.backend.clear_session()
        model = model_instance_(config)
        predictions = model.predict(inputs)
        onnx_model = keras2onnx.convert_keras(model)
        self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions,
                                         self.model_files, rtol=1.e-2, atol=1.e-4))
Example #2
Source File: dataloader.py From tatk with Apache License 2.0
def __init__(self, intent_vocab, tag_vocab, pretrained_weights):
    """
    :param intent_vocab: list of all intents
    :param tag_vocab: list of all tags
    :param pretrained_weights: which bert, e.g. 'bert-base-uncased'
    """
    self.intent_vocab = intent_vocab
    self.tag_vocab = tag_vocab
    self.intent_dim = len(intent_vocab)
    self.tag_dim = len(tag_vocab)
    self.id2intent = dict([(i, x) for i, x in enumerate(intent_vocab)])
    self.intent2id = dict([(x, i) for i, x in enumerate(intent_vocab)])
    self.id2tag = dict([(i, x) for i, x in enumerate(tag_vocab)])
    self.tag2id = dict([(x, i) for i, x in enumerate(tag_vocab)])
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.data = {}
    self.intent_weight = [1] * len(self.intent2id)
Example #3
Source File: dataloader.py From ConvLab with MIT License
def __init__(self, intent_vocab, tag_vocab, pretrained_weights):
    """
    :param intent_vocab: list of all intents
    :param tag_vocab: list of all tags
    :param pretrained_weights: which bert, e.g. 'bert-base-uncased'
    """
    self.intent_vocab = intent_vocab
    self.tag_vocab = tag_vocab
    self.intent_dim = len(intent_vocab)
    self.tag_dim = len(tag_vocab)
    self.id2intent = dict([(i, x) for i, x in enumerate(intent_vocab)])
    self.intent2id = dict([(x, i) for i, x in enumerate(intent_vocab)])
    self.id2tag = dict([(i, x) for i, x in enumerate(tag_vocab)])
    self.tag2id = dict([(x, i) for i, x in enumerate(tag_vocab)])
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.data = {}
    self.intent_weight = [1] * len(self.intent2id)
Example #4
Source File: bert_tf_to_pytorch.py From inference with Apache License 2.0
def save_to_onnx(model):
    tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    model.eval()

    dummy_input = torch.ones((1, 384), dtype=torch.int64)
    torch.onnx.export(
        model,
        (dummy_input, dummy_input, dummy_input),
        "build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx",
        verbose=True,
        input_names=["input_ids", "input_mask", "segment_ids"],
        output_names=["output_start_logits", "output_end_logits"],
        opset_version=11,
        dynamic_axes={"input_ids": {0: "batch_size"},
                      "input_mask": {0: "batch_size"},
                      "segment_ids": {0: "batch_size"},
                      "output_start_logits": {0: "batch_size"},
                      "output_end_logits": {0: "batch_size"}}
    )
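As a follow-up to the export above, the resulting graph can be executed with ONNX Runtime. This is a hedged sketch rather than code from the inference project: it assumes onnxruntime is installed, reuses the input/output names and the fixed sequence length of 384 from the export call, and the question/context strings are made up for illustration.

import numpy as np
import onnxruntime
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
session = onnxruntime.InferenceSession("build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx")

# Encode an illustrative question/context pair, padded to the fixed length used in the export.
encoded = tokenizer.encode_plus("Who created BERT?",
                                "BERT was created by researchers at Google.",
                                max_length=384, pad_to_max_length=True)

inputs = {
    "input_ids": np.array([encoded["input_ids"]], dtype=np.int64),
    "input_mask": np.array([encoded["attention_mask"]], dtype=np.int64),
    "segment_ids": np.array([encoded["token_type_ids"]], dtype=np.int64),
}

start_logits, end_logits = session.run(["output_start_logits", "output_end_logits"], inputs)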
Example #5
Source File: bert_models.py From danlp with BSD 3-Clause "New" or "Revised" License
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    path_emotion = download_model('bert.emotion', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)
    path_emotion = os.path.join(path_emotion, 'bert.emotion')
    path_reject = download_model('bert.noemotion', cache_dir,
                                 process_func=_unzip_process_func,
                                 verbose=verbose)
    path_reject = os.path.join(path_reject, 'bert.noemotion')

    # load the models
    self.tokenizer_rejct = BertTokenizer.from_pretrained(path_reject)
    self.model_reject = BertForSequenceClassification.from_pretrained(path_reject)
    self.tokenizer = BertTokenizer.from_pretrained(path_emotion)
    self.model = BertForSequenceClassification.from_pretrained(path_emotion)

    # load the class names mapping
    self.catagories = {5: 'Foragt/Modvilje', 2: 'Forventning/Interrese',
                       0: 'Glæde/Sindsro', 3: 'Overasket/Målløs',
                       1: 'Tillid/Accept', 4: 'Vrede/Irritation',
                       6: 'Sorg/trist', 7: 'Frygt/Bekymret'}
Example #6
Source File: sentence_encoder.py From FewRel with MIT License
def __init__(self, pretrain_path, max_length, cat_entity_rep=False):
    nn.Module.__init__(self)
    self.bert = BertModel.from_pretrained(pretrain_path)
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.cat_entity_rep = cat_entity_rep
Example #7
Source File: bert_learner.py From medaCy with GNU General Public License v3.0
def load(self, path):
    """Load saved model and vectorizer.

    :param path: Path of directory where the model was saved.
    """
    self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_model)

    vectorizer_values = torch.load(path + '/vectorizer.pt')
    self.vectorizer = Vectorizer(device=self.device)
    self.vectorizer.load_values(vectorizer_values)

    model_class = BertCrfForTokenClassification if self.using_crf else BertForTokenClassification
    self.model = model_class.from_pretrained(
        path,
        num_labels=len(self.vectorizer.tag_to_index) - 1  # Ignore 'X'
    )
    self.model = self.model.to(self.device)
Example #8
Source File: run_pplm_discrim_train.py From PPLM with Apache License 2.0
def __init__(
        self,
        class_size=None,
        pretrained_model="gpt2-medium",
        classifier_head=None,
        cached_mode=False,
        device='cpu'
):
    super(Discriminator, self).__init__()
    if pretrained_model.startswith("gpt2"):
        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
        self.embed_size = self.encoder.transformer.config.hidden_size
    elif pretrained_model.startswith("bert"):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
        self.encoder = BertModel.from_pretrained(pretrained_model)
        self.embed_size = self.encoder.config.hidden_size
    else:
        raise ValueError(
            "{} model not yet supported".format(pretrained_model)
        )
    if classifier_head:
        self.classifier_head = classifier_head
    else:
        if not class_size:
            raise ValueError("must specify class_size")
        self.classifier_head = ClassificationHead(
            class_size=class_size,
            embed_size=self.embed_size
        )
    self.cached_mode = cached_mode
    self.device = device
Example #9
Source File: convert_to_presumm.py From summarus with Apache License 2.0
def __init__(self, bert_model, lower, max_src_tokens, max_tgt_tokens):
    self.max_src_tokens = max_src_tokens
    self.max_tgt_tokens = max_tgt_tokens
    self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=lower)
    self.sep_token = '[SEP]'
    self.cls_token = '[CLS]'
    self.pad_token = '[PAD]'
    self.tgt_bos = '[unused1] '
    self.tgt_eos = ' [unused2]'
    self.tgt_sent_split = ' [unused3] '
    self.sep_vid = self.tokenizer.vocab[self.sep_token]
    self.cls_vid = self.tokenizer.vocab[self.cls_token]
    self.pad_vid = self.tokenizer.vocab[self.pad_token]
Example #10
Source File: bert_models.py From danlp with BSD 3-Clause "New" or "Revised" License
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import AutoModelForTokenClassification
    from transformers import AutoTokenizer

    # download the model or load the model path
    weights_path = download_model('bert.ner', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)

    self.label_list = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                       "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

    self.model = AutoModelForTokenClassification.from_pretrained(weights_path)
    self.tokenizer = AutoTokenizer.from_pretrained(weights_path)
Example #11
Source File: bert_models.py From danlp with BSD 3-Clause "New" or "Revised" License
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    path_sub = download_model('bert.subjective', cache_dir,
                              process_func=_unzip_process_func, verbose=verbose)
    path_sub = os.path.join(path_sub, 'bert.sub.v0.0.1')
    path_pol = download_model('bert.polarity', cache_dir,
                              process_func=_unzip_process_func, verbose=verbose)
    path_pol = os.path.join(path_pol, 'bert.pol.v0.0.1')

    self.tokenizer_sub = BertTokenizer.from_pretrained(path_sub)
    self.model_sub = BertForSequenceClassification.from_pretrained(path_sub)
    self.tokenizer_pol = BertTokenizer.from_pretrained(path_pol)
    self.model_pol = BertForSequenceClassification.from_pretrained(path_pol)
Example #12
Source File: bert_encoder.py From REDN with MIT License
def __init__(self, max_length, pretrain_path, blank_padding=True):
    """
    Args:
        max_length: max length of sentence
        pretrain_path: path of pretrain model
    """
    super().__init__()
    self.max_length = max_length
    self.blank_padding = blank_padding
    self.hidden_size = 768 * 2
    self.bert = BertModel.from_pretrained(pretrain_path)
    self.tokenizer = BertTokenizer.from_pretrained(pretrain_path)
    self.linear = nn.Linear(self.hidden_size, self.hidden_size)
Example #13
Source File: sentence_encoder.py From FewRel with MIT License
def __init__(self, pretrain_path, max_length):
    nn.Module.__init__(self)
    self.bert = BertForSequenceClassification.from_pretrained(
        pretrain_path,
        num_labels=2)
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Example #14
Source File: sentence_encoder.py From FewRel with MIT License
def __init__(self, pretrain_path, max_length, cat_entity_rep=False):
    nn.Module.__init__(self)
    self.roberta = RobertaModel.from_pretrained(pretrain_path)
    self.max_length = max_length
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    self.cat_entity_rep = cat_entity_rep
Example #15
Source File: sentence_encoder.py From FewRel with MIT License
def __init__(self, pretrain_path, max_length):
    nn.Module.__init__(self)
    self.roberta = RobertaForSequenceClassification.from_pretrained(
        pretrain_path,
        num_labels=2)
    self.max_length = max_length
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
Example #16
Source File: bert_encoder.py From REDN with MIT License
def __init__(self, max_length, pretrain_path, blank_padding=True):
    """
    Args:
        max_length: max length of sentence
        pretrain_path: path of pretrain model
    """
    super().__init__()
    self.max_length = max_length
    self.blank_padding = blank_padding
    self.bert = BertModel.from_pretrained(pretrain_path)
    self.hidden_size = self.bert.config.hidden_size
    self.tokenizer = BertTokenizer.from_pretrained(pretrain_path)
Example #17
Source File: bert_encoder.py From REDN with MIT License
def __init__(self, pretrain_path, blank_padding=True):
    super().__init__(80, pretrain_path, blank_padding)
    self.bert = BertModel.from_pretrained(pretrain_path,
                                          output_hidden_states=True,
                                          output_attentions=True)
Example #18
Source File: bert_encoder.py From OpenNRE with MIT License
def __init__(self, max_length, pretrain_path, blank_padding=True, mask_entity=False):
    """
    Args:
        max_length: max length of sentence
        pretrain_path: path of pretrain model
    """
    super().__init__()
    self.max_length = max_length
    self.blank_padding = blank_padding
    self.hidden_size = 768
    self.mask_entity = mask_entity
    logging.info('Loading BERT pre-trained checkpoint.')
    self.bert = BertModel.from_pretrained(pretrain_path)
    self.tokenizer = BertTokenizer.from_pretrained(pretrain_path)
Example #19
Source File: test_transformers.py From keras-onnx with MIT License
def test_3layer_gpt2(self):
    from transformers import GPT2Config, TFGPT2Model, BertTokenizer
    keras2onnx.proto.keras.backend.set_learning_phase(0)
    config = GPT2Config(n_layer=3)
    model = TFGPT2Model(config)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    inputs = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='tf')
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions,
                                     self.model_files))
Example #20
Source File: modeling_tf_bert.py From exbert with Apache License 2.0
def call(self, inputs, **kwargs):
    r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.BertConfig`) and inputs:
        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
            Attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForTokenClassification

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        scores = outputs[0]
    """
    outputs = self.bert(inputs, **kwargs)
    sequence_output = outputs[0]
    sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
    logits = self.classifier(sequence_output)
    outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
    return outputs  # scores, (hidden_states), (attentions)
Example #21
Source File: modeling_tf_bert.py From exbert with Apache License 2.0
def call(self, inputs, **kwargs):
    r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.BertConfig`) and inputs:
        seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`):
            Prediction scores of the next sequence prediction (classification) head
            (scores of True/False continuation before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
            Attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForNextSentencePrediction

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        seq_relationship_scores = outputs[0]
    """
    outputs = self.bert(inputs, **kwargs)
    pooled_output = outputs[1]
    seq_relationship_score = self.nsp(pooled_output)
    outputs = (seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
    return outputs  # seq_relationship_score, (hidden_states), (attentions)
Example #22
Source File: modeling_tf_bert.py From exbert with Apache License 2.0
def call(self, inputs, **kwargs):
    r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.BertConfig`) and inputs:
        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
            Attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForMaskedLM

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]
    """
    outputs = self.bert(inputs, **kwargs)
    sequence_output = outputs[0]
    prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
    outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
    return outputs  # prediction_scores, (hidden_states), (attentions)
Example #23
Source File: modeling_tf_bert.py From exbert with Apache License 2.0
def call(self, inputs, **kwargs):
    r"""
    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.BertConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertModel

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertModel.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    outputs = self.bert(inputs, **kwargs)
    return outputs
Example #24
Source File: BERT.py From sentence-transformers with Apache License 2.0
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: Optional[bool] = None,
             model_args: Dict = {}, tokenizer_args: Dict = {}):
    super(BERT, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 510:
        logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). Value will be set to 510")
        max_seq_length = 510
    self.max_seq_length = max_seq_length

    if self.do_lower_case is not None:
        tokenizer_args['do_lower_case'] = do_lower_case

    self.bert = BertModel.from_pretrained(model_name_or_path, **model_args)
    self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
Example #25
Source File: bertqa_sklearn.py From cdQA with Apache License 2.0
def __init__(
    self,
    bert_model="bert-base-uncased",
    do_lower_case=True,
    is_training=False,
    version_2_with_negative=False,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    verbose=False,
    tokenizer=None,
):
    self.bert_model = bert_model
    self.do_lower_case = do_lower_case
    self.is_training = is_training
    self.version_2_with_negative = version_2_with_negative
    self.max_seq_length = max_seq_length
    self.doc_stride = doc_stride
    self.max_query_length = max_query_length
    self.verbose = verbose
    if tokenizer is None:
        self.tokenizer = BertTokenizer.from_pretrained(
            self.bert_model, do_lower_case=self.do_lower_case
        )
    else:
        self.tokenizer = tokenizer
        logger.info("loading custom tokenizer")
Example #26
Source File: test_processor.py From cdQA with Apache License 2.0
def test_processor_functions():
    download_squad(dir="./data")

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    max_seq_length = 256
    max_query_length = 32
    doc_stride = 128
    is_training = False
    verbose = False

    examples = read_squad_examples(
        "./data/SQuAD_1.1/dev-v1.1.json",
        is_training=is_training,
        version_2_with_negative=False,
    )

    assert len(examples) == 10570

    features = convert_examples_to_features(
        examples,
        tokenizer,
        max_seq_length,
        doc_stride,
        max_query_length,
        is_training,
        verbose,
    )

    assert len(features) == 12006
Example #27
Source File: bert_preprocessor.py From interpret-text with MIT License
def get_tokenizer(self) -> BertTokenizer:
    """
    Return bert tokenizer

    :return: BertTokenizer
    :rtype BertTokenizer
    """
    return BertTokenizer.from_pretrained("bert-base-uncased")
Example #28
Source File: bert_encoder.py From OpenNRE with MIT License
def __init__(self, max_length, pretrain_path, blank_padding=True, mask_entity=False):
    """
    Args:
        max_length: max length of sentence
        pretrain_path: path of pretrain model
    """
    super().__init__()
    self.max_length = max_length
    self.blank_padding = blank_padding
    self.hidden_size = 768 * 2
    self.mask_entity = mask_entity
    logging.info('Loading BERT pre-trained checkpoint.')
    self.bert = BertModel.from_pretrained(pretrain_path)
    self.tokenizer = BertTokenizer.from_pretrained(pretrain_path)
    self.linear = nn.Linear(self.hidden_size, self.hidden_size)
Example #29
Source File: binarized_data.py From DistilKoBERT with Apache License 2.0
def main():
    parser = argparse.ArgumentParser(
        description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--tokenizer_type', type=str, default='bert',
                        choices=['bert', 'roberta', 'gpt2', 'kobert'])
    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
    if args.tokenizer_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `[CLS]`
        sep = tokenizer.special_tokens_map['sep_token']  # `[SEP]`
    elif args.tokenizer_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `<s>`
        sep = tokenizer.special_tokens_map['sep_token']  # `</s>`
    elif args.tokenizer_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['bos_token']  # `<|endoftext|>`
        sep = tokenizer.special_tokens_map['eos_token']  # `<|endoftext|>`
    elif args.tokenizer_type == 'kobert':
        tokenizer = KoBertTokenizer.from_pretrained('kobert')
        bos = tokenizer.special_tokens_map['cls_token']
        sep = tokenizer.special_tokens_map['sep_token']

    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
        data = fp.readlines()

    logger.info(f'Start encoding')
    logger.info(f'{len(data)} examples to process.')

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
        text = f'{bos} {text.strip()} {sep}'
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
            logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl')
            start = time.time()
    logger.info('Finished binarization')
    logger.info(f'{len(data)} examples processed.')

    dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
    with open(dp_file, 'wb') as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
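A natural follow-up, not part of the DistilKoBERT repository, is reading the dump back for training. The sketch below assumes the default --dump_file and --tokenizer_name values, so the pickle path shown is hypothetical.

import pickle

# Path follows the f'{args.dump_file}.{args.tokenizer_name}.pickle' scheme used above.
with open('data/dump.bert-base-uncased.pickle', 'rb') as handle:
    sequences = pickle.load(handle)

# Each entry is a numpy uint16 array holding the token ids for one line of the input file.
print(f'{len(sequences)} sequences loaded; first 10 ids of the first one: {sequences[0][:10]}')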
Example #30
Source File: modeling_tf_bert.py From exbert with Apache License 2.0
def call(self, inputs, **kwargs):
    r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.BertConfig`) and inputs:
        start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-start scores (before SoftMax).
        end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
            Attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import BertTokenizer, TFBertForQuestionAnswering

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        start_scores, end_scores = outputs[:2]
    """
    outputs = self.bert(inputs, **kwargs)
    sequence_output = outputs[0]

    logits = self.qa_outputs(sequence_output)
    start_logits, end_logits = tf.split(logits, 2, axis=-1)
    start_logits = tf.squeeze(start_logits, axis=-1)
    end_logits = tf.squeeze(end_logits, axis=-1)

    outputs = (start_logits, end_logits,) + outputs[2:]

    return outputs  # start_logits, end_logits, (hidden_states), (attentions)