Python transformers.AutoTokenizer.from_pretrained() Examples
The following are 26 code examples of transformers.AutoTokenizer.from_pretrained(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module transformers.AutoTokenizer, or try the search function.
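As a quick orientation before the examples, here is a minimal sketch of the pattern they all build on: load a tokenizer by checkpoint name, then encode and decode text. The model name and sample sentence are arbitrary illustrative choices, not taken from any of the projects listed below.

from transformers import AutoTokenizer

# Download the tokenizer files for a checkpoint (or load them from the local cache).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode a sentence into token ids; add_special_tokens inserts [CLS] and [SEP].
token_ids = tokenizer.encode("Hello, transformers!", add_special_tokens=True)
print(token_ids)

# Decode the ids back to text to check the round trip.
print(tokenizer.decode(token_ids))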
Example #1
Source File: field.py From flambe with MIT License | 7 votes |
def __init__(self,
             alias: str,
             cache_dir: Optional[str] = None,
             max_len_truncate: int = 500,
             add_special_tokens: bool = True,
             **kwargs) -> None:
    """Initialize a pretrained tokenizer.

    Parameters
    ----------
    alias: str
        Alias of a pretrained tokenizer.
    cache_dir: str, optional
        A directory where to cache the downloaded vocabularies.
    max_len_truncate: int, default = 500
        Truncates the length of the tokenized sequence.
        Because several pretrained models crash when this is > 500,
        it defaults to 500.
    add_special_tokens: bool, optional
        Add the special tokens to the inputs. Default ``True``.

    """
    self._tokenizer = AutoTokenizer.from_pretrained(alias, cache_dir=cache_dir, **kwargs)
    self.max_len_truncate = max_len_truncate
    self.add_special_tokens = add_special_tokens
Example #2
Source File: transformers.py From nboost with Apache License 2.0 | 6 votes |
def __init__(self,
             model_dir: str = 'nboost/pt-tinybert-msmarco',
             verbose: bool = defaults.verbose,
             max_seq_len: int = defaults.max_seq_len,
             **kwargs):
    super().__init__(**kwargs)
    self.logger = set_logger(model_dir, verbose=verbose)
    self.max_seq_len = max_seq_len

    self.logger.info('Loading from checkpoint %s' % model_dir)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if self.device == torch.device("cpu"):
        self.logger.info("RUNNING ON CPU")
    else:
        self.logger.info("RUNNING ON CUDA")
        torch.cuda.synchronize(self.device)

    self.rerank_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
    self.rerank_model.to(self.device, non_blocking=True)
Example #3
Source File: onnxbert.py From nboost with Apache License 2.0 | 6 votes |
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    sess_options = rt.SessionOptions()
    self.model_dir = glob.glob(os.path.join(self.model_dir, '*.onnx'))[0]

    # Set graph optimization level to ORT_ENABLE_EXTENDED to enable bert optimization.
    sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

    # To enable model serialization and store the optimized graph to desired location.
    sess_options.optimized_model_filepath = self.model_dir
    self.session = rt.InferenceSession(self.model_dir, sess_options)

    if 'albert' in self.model_dir:
        self.tokenizer = AutoTokenizer.from_pretrained('albert-base-uncased')
    else:
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
Example #4
Source File: bert_models.py From danlp with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    path_emotion = download_model('bert.emotion', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)
    path_emotion = os.path.join(path_emotion, 'bert.emotion')
    path_reject = download_model('bert.noemotion', cache_dir,
                                 process_func=_unzip_process_func,
                                 verbose=verbose)
    path_reject = os.path.join(path_reject, 'bert.noemotion')

    # load the models
    self.tokenizer_rejct = BertTokenizer.from_pretrained(path_reject)
    self.model_reject = BertForSequenceClassification.from_pretrained(path_reject)
    self.tokenizer = BertTokenizer.from_pretrained(path_emotion)
    self.model = BertForSequenceClassification.from_pretrained(path_emotion)

    # load the class names mapping
    self.catagories = {5: 'Foragt/Modvilje', 2: 'Forventning/Interrese',
                       0: 'Glæde/Sindsro', 3: 'Overasket/Målløs',
                       1: 'Tillid/Accept', 4: 'Vrede/Irritation',
                       6: 'Sorg/trist', 7: 'Frygt/Bekymret'}
Example #5
Source File: test_perplexity_callback.py From catalyst with Apache License 2.0 | 6 votes |
def test_is_running():
    """Test if perplexity is running normal"""
    tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModelWithLMHead.from_pretrained("distilbert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    collate_fn = DataCollatorForLanguageModeling(tok).collate_batch
    dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    runner = HuggingFaceRunner()
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders={"train": dataloader},
        callbacks={
            "optimizer": dl.OptimizerCallback(),
            "perplexity": PerplexityMetricCallback(),
        },
        check=True,
    )
Example #6
Source File: prediction.py From fast-bert with Apache License 2.0 | 6 votes |
def __init__(
    self,
    model_path,
    label_path,
    multi_label=False,
    model_type="bert",
    use_fast_tokenizer=True,
    do_lower_case=True,
):
    self.model_path = model_path
    self.label_path = label_path
    self.multi_label = multi_label
    self.model_type = model_type
    self.do_lower_case = do_lower_case

    # Use auto-tokenizer
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.model_path, use_fast=use_fast_tokenizer
    )

    self.learner = self.get_learner()
Example #7
Source File: pipelines.py From exbert with Apache License 2.0 | 6 votes |
def get_defaults(self, model, tokenizer, framework):
    task_defaults = SUPPORTED_TASKS[self.task]
    if model is None:
        if framework == "tf":
            model = task_defaults["tf"].from_pretrained(task_defaults["default"]["model"]["tf"])
        elif framework == "pt":
            model = task_defaults["pt"].from_pretrained(task_defaults["default"]["model"]["pt"])
        else:
            raise ValueError("Provided framework should be either 'tf' for TensorFlow or 'pt' for PyTorch.")

    if tokenizer is None:
        default_tokenizer = task_defaults["default"]["tokenizer"]
        if isinstance(default_tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            tokenizer = AutoTokenizer.from_pretrained(default_tokenizer[0], **default_tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(default_tokenizer)

    return model, tokenizer
Example #8
Source File: question_answering.py From nlp-recipes with MIT License | 6 votes |
def __init__( self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir=".", ): self.model_name = model_name self.tokenizer = AutoTokenizer.from_pretrained( model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False, ) self.do_lower_case = to_lower self.custom_tokenize = custom_tokenize
Example #9
Source File: preprocess.py From unilm with MIT License | 6 votes |
def seg(args):
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path, do_lower_case=True
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + ".txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_box.txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_image.txt.tmp"),
        tokenizer,
        args.max_len,
    )
Example #10
Source File: transfo_experiment.py From axcell with Apache License 2.0 | 5 votes |
def tokenizer(self):
    if self._tokenizer is None:
        self._tokenizer = AutoTokenizer.from_pretrained(self.pretrained_name)
    return self._tokenizer
Example #11
Source File: transfo_experiment.py From axcell with Apache License 2.0 | 5 votes |
def train_model(self, data: TransfoDatabunch):
    self.set_seed("class")
    self.train_started = time.time()
    num_labels = data.num_labels
    config = AutoConfig.from_pretrained(self.pretrained_name, num_labels=num_labels)  # , finetuning_task=args.task_name
    model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_name, config=config)
    train(self, data.train_ds, data.valid_ds, model.to(self.device), self._tokenizer)
    model.to("cpu")
    return model
Example #12
Source File: download.py From exbert with Apache License 2.0 | 5 votes |
def run(self):
    from transformers import AutoModel, AutoTokenizer

    AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
    AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
Example #13
Source File: question_answering.py From nlp-recipes with MIT License | 5 votes |
def __init__( self, model_name="bert-base-cased", cache_dir=".", load_model_from_dir=None ): model = MODEL_CLASS[model_name].from_pretrained( model_name if load_model_from_dir is None else load_model_from_dir, cache_dir=cache_dir, output_loading_info=False, ) super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)
Example #14
Source File: test_language_modeling_dataset.py From catalyst with Apache License 2.0 | 5 votes |
def test_tokenizer_tokenizer():
    """Test initialization with tokenizer"""
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    assert dataset[0] is not None
    assert len(dataset) == 2
Example #15
Source File: test_language_modeling_dataset.py From catalyst with Apache License 2.0 | 5 votes |
def test_exception_with_sort():
    """Test lazy=True sort=True case"""
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = LanguageModelingDataset(  # noqa: F841
        texts, tok, lazy=True, sort=True
    )
Example #16
Source File: bert_models.py From danlp with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import AutoModelForTokenClassification
    from transformers import AutoTokenizer

    # download the model or load the model path
    weights_path = download_model('bert.ner', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)

    self.label_list = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                       "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

    self.model = AutoModelForTokenClassification.from_pretrained(weights_path)
    self.tokenizer = AutoTokenizer.from_pretrained(weights_path)
Example #17
Source File: bert_models.py From danlp with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    path_sub = download_model('bert.subjective', cache_dir,
                              process_func=_unzip_process_func,
                              verbose=verbose)
    path_sub = os.path.join(path_sub, 'bert.sub.v0.0.1')
    path_pol = download_model('bert.polarity', cache_dir,
                              process_func=_unzip_process_func,
                              verbose=verbose)
    path_pol = os.path.join(path_pol, 'bert.pol.v0.0.1')

    self.tokenizer_sub = BertTokenizer.from_pretrained(path_sub)
    self.model_sub = BertForSequenceClassification.from_pretrained(path_sub)
    self.tokenizer_pol = BertTokenizer.from_pretrained(path_pol)
    self.model_pol = BertForSequenceClassification.from_pretrained(path_pol)
Example #18
Source File: Transformer.py From sentence-transformers with Apache License 2.0 | 5 votes |
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             model_args: Dict = {}, cache_dir: Optional[str] = None):
    super(Transformer, self).__init__()
    self.config_keys = ['max_seq_length']
    self.max_seq_length = max_seq_length

    config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
    self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
    self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
Example #19
Source File: abstractive_summarization_bertsum.py From nlp-recipes with MIT License | 4 votes |
def __init__( self, processor, model_name="bert-base-uncased", finetune_bert=True, cache_dir=".", label_smoothing=0.1, test=False, max_pos_length=768, ): """Initialize an object of BertSumAbs. Args: processor (BertSumAbsProcessor): A processor with symbols, tokenizers and collate functions that are used in finetuning and prediction. model_name (str, optional:) Name of the pretrained model which is used to initialize the encoder of the BertSumAbs model. check MODEL_CLASS for supported models. Defaults to "bert-base-uncased". finetune_bert (bool, option): Whether the bert model in the encoder is finetune or not. Defaults to True. cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". label_smoothing (float, optional): The amount of label smoothing. Value range is [0, 1]. Defaults to 0.1. test (bool, optional): Whether the class is initiated for test or not. It must be True if the class obj is only initialized to load a checkpoint for test/inferencing. Defaults to False. max_pos_length (int, optional): maximum postional embedding length for the input. Defaults to 768. """ model = MODEL_CLASS[model_name].from_pretrained( model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False ) super().__init__(model_name=model_name, model=model, cache_dir=cache_dir) if model_name not in self.list_supported_models(): raise ValueError( "Model name {} is not supported by BertSumAbs. " "Call 'BertSumAbs.list_supported_models()' to get all supported model " "names.".format(value) ) self.model_class = MODEL_CLASS[model_name] self.cache_dir = cache_dir self.max_pos_length = max_pos_length self.model = AbsSummarizer( temp_dir=cache_dir, finetune_bert=finetune_bert, checkpoint=None, label_smoothing=label_smoothing, symbols=processor.symbols, test=test, max_pos=self.max_pos_length, ) self.processor = processor self.optim_bert = None self.optim_dec = None
Example #20
Source File: abstractive_summarization_bertsum.py From nlp-recipes with MIT License | 4 votes |
def __init__( self, model_name="bert-base-uncased", to_lower=True, cache_dir=".", max_src_len=640, max_tgt_len=140, ): """ Initialize the preprocessor. Args: model_name (str, optional): Transformer model name used in preprocessing. check MODEL_CLASS for supported models. Defaults to "bert-base-cased". to_lower (bool, optional): Whether to convert all letters to lower case during tokenization. This is determined by if a cased model is used. Defaults to True, which corresponds to a uncased model. cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". max_src_len (int, optional): Max number of tokens that be used as input. Defaults to 640. max_tgt_len (int, optional): Max number of tokens that be used as in target. Defaults to 140. """ self.model_name = model_name self.tokenizer = AutoTokenizer.from_pretrained( model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False, ) self.symbols = { "BOS": self.tokenizer.vocab["[unused0]"], "EOS": self.tokenizer.vocab["[unused1]"], "PAD": self.tokenizer.vocab["[PAD]"], "EOQ": self.tokenizer.vocab["[unused2]"], } self.sep_token = "[SEP]" self.cls_token = "[CLS]" self.pad_token = "[PAD]" self.tgt_bos = self.symbols["BOS"] self.tgt_eos = self.symbols["EOS"] self.max_src_len = max_src_len self.max_tgt_len = max_tgt_len
Example #21
Source File: extractive_summarization.py From nlp-recipes with MIT License | 4 votes |
def __init__( self, model_name="distilbert-base-uncased", to_lower=False, cache_dir=".", max_nsents=200, max_src_ntokens=2000, min_nsents=3, min_src_ntokens=5, ): """ Initialize the preprocessor. Args: model_name (str, optional): Transformer model name used in preprocessing. check MODEL_CLASS for supported models. Defaults to "bert-base-cased". to_lower (bool, optional): Whether to convert all letters to lower case during tokenization. This is determined by if a cased model is used. Defaults to False, which corresponds to a cased model. cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". max_nsents (int, optional): Max number of sentences that can be used as input. Defaults to 200. max_src_ntokens (int, optional): Max number of tokens that be used as input. Defaults to 2000. min_nsents (int, optional): Minimum number of sentences that are required as input. If the input has less number of sentences than this value, it's skipped and cannot be used as a valid input. Defaults to 3. min_src_ntokens (int, optional): Minimum number of tokens that are required as an input sentence.If the input sentence has less number of tokens than this value, it's skipped and cannot be used as a valid sentence. Defaults to 5. """ self.model_name = model_name self.tokenizer = AutoTokenizer.from_pretrained( model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False, ) self.sep_vid = self.tokenizer.vocab["[SEP]"] self.cls_vid = self.tokenizer.vocab["[CLS]"] self.pad_vid = self.tokenizer.vocab["[PAD]"] self.max_nsents = max_nsents self.max_src_ntokens = max_src_ntokens self.min_nsents = min_nsents self.min_src_ntokens = min_src_ntokens
Example #22
Source File: language_modeling.py From catalyst with Apache License 2.0 | 4 votes |
def __init__(
    self,
    texts: Iterable[str],
    tokenizer: Union[str, PreTrainedTokenizer],
    max_seq_length: int = None,
    sort: bool = True,
    lazy: bool = False,
):
    """
    Args:
        texts (Iterable): Iterable object with text
        tokenizer (str or tokenizer): pretrained huggingface tokenizer or
            model name
        max_seq_length (int): max sequence length to tokenize
        sort (bool): If True then sort all sequences by length
            for efficient padding
        lazy (bool): If True then tokenize and encode sequence
            in __getitem__ method, else tokenize in __init__;
            if set to True, sorting is unavailable
    """
    if sort and lazy:
        raise Exception(
            "lazy is set to True so we can't sort"
            " sequences by length.\n"
            "You should set sort=False and lazy=True"
            " if you want to encode text in __get_item__ function"
        )
    if isinstance(tokenizer, str):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    elif isinstance(
        tokenizer, transformers.tokenization_utils.PreTrainedTokenizer
    ):
        self.tokenizer = tokenizer
    else:
        raise TypeError(
            "tokenizer argument should be a model name"
            + " or huggingface PreTrainedTokenizer"
        )

    self.max_seq_length = max_seq_length

    self.lazy = lazy

    if lazy:
        self.texts = texts

    if not lazy:
        pbar = tqdm(texts, desc="tokenizing texts")
        self.encoded = [
            self.tokenizer.encode(text, max_length=max_seq_length)
            for text in pbar
        ]
        if sort:
            self.encoded.sort(key=len)

    self.length = len(texts)

    self._getitem_fn = (
        self._getitem_lazy if lazy else self._getitem_encoded
    )
Example #23
Source File: text_classification.py From catalyst with Apache License 2.0 | 4 votes |
def __init__(
    self,
    texts: List[str],
    labels: List[str] = None,
    label_dict: Mapping[str, int] = None,
    max_seq_length: int = 512,
    model_name: str = "distilbert-base-uncased",
):
    """
    Args:
        texts (List[str]): a list with texts to classify or to train the
            classifier on
        labels (List[str]): a list with classification labels (optional)
        label_dict (dict): a dictionary mapping class names to class ids,
            to be passed to the validation data (optional)
        max_seq_length (int): maximal sequence length in tokens,
            texts will be stripped to this length
        model_name (str): transformer model name, needed to perform
            appropriate tokenization
    """
    self.texts = texts
    self.labels = labels
    self.label_dict = label_dict
    self.max_seq_length = max_seq_length

    if self.label_dict is None and labels is not None:
        # {'class1': 0, 'class2': 1, 'class3': 2, ...}
        # using this instead of `sklearn.preprocessing.LabelEncoder`
        # to easily handle unknown target values
        self.label_dict = dict(
            zip(sorted(set(labels)), range(len(set(labels))))
        )

    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    # suppresses tokenizer warnings
    logging.getLogger("transformers.tokenization_utils").setLevel(
        logging.FATAL
    )

    # special tokens for transformers
    # in the simplest case a [CLS] token is added at the beginning
    # and a [SEP] token is added at the end of a piece of text
    # [CLS] <indexes text tokens> [SEP] .. <[PAD]>
    self.sep_vid = self.tokenizer.vocab["[SEP]"]
    self.cls_vid = self.tokenizer.vocab["[CLS]"]
    self.pad_vid = self.tokenizer.vocab["[PAD]"]
Example #24
Source File: benchmarks.py From exbert with Apache License 2.0 | 4 votes |
def _compute_tensorflow(model_names, dictionary, average_over, amp):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name)
        model = TFAutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]
        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        print("Using model", model)

        @tf.function
        def inference(inputs):
            return model(inputs)

        for batch_size in batch_sizes:
            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = tf.stack(
                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
                    )

                    try:
                        print("Going through model with sequence of shape", sequence.shape)
                        # To make sure that the model is traced + that the tensors are on the appropriate device
                        inference(sequence)

                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    except tf.errors.ResourceExhaustedError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
    return dictionary
Example #25
Source File: benchmarks.py From exbert with Apache License 2.0 | 4 votes |
def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
        model = AutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]
        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            if fp16:
                model.half()
            model.to(device)
            model.eval()

            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
                    try:
                        if torchscript:
                            print("Tracing model with sequence size", sequence.shape)
                            inference = torch.jit.trace(model, sequence)
                            inference(sequence)
                        else:
                            inference = model
                            inference(sequence)

                        print("Going through model with sequence of shape", sequence.shape)
                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
    return dictionary
Example #26
Source File: wordpiece_indexer.py From NLP_Toolkit with Apache License 2.0 | 4 votes |
def __init__(self,
             pretrained_model: str,
             use_starting_offsets: bool = False,
             do_lowercase: bool = True,
             never_lowercase: List[str] = None,
             max_pieces: int = 512,
             max_pieces_per_token=5,
             is_test=False,
             truncate_long_sequences: bool = True,
             special_tokens_fix: int = 0) -> None:
    if pretrained_model.endswith("-cased") and do_lowercase:
        logger.warning("Your BERT model appears to be cased, "
                       "but your indexer is lowercasing tokens.")
    elif pretrained_model.endswith("-uncased") and not do_lowercase:
        logger.warning("Your BERT model appears to be uncased, "
                       "but your indexer is not lowercasing tokens.")

    bert_tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model, do_lower_case=do_lowercase, do_basic_tokenize=False)

    # to adjust all tokenizers
    if hasattr(bert_tokenizer, 'encoder'):
        bert_tokenizer.vocab = bert_tokenizer.encoder
    if hasattr(bert_tokenizer, 'sp_model'):
        bert_tokenizer.vocab = defaultdict(lambda: 1)
        for i in range(bert_tokenizer.sp_model.get_piece_size()):
            bert_tokenizer.vocab[bert_tokenizer.sp_model.id_to_piece(i)] = i

    if special_tokens_fix:
        bert_tokenizer.add_tokens([START_TOKEN])
        bert_tokenizer.vocab[START_TOKEN] = len(bert_tokenizer) - 1

    if "roberta" in pretrained_model:
        bpe_ranks = bert_tokenizer.bpe_ranks
        byte_encoder = bert_tokenizer.byte_encoder
    else:
        bpe_ranks = {}
        byte_encoder = None

    super().__init__(vocab=bert_tokenizer.vocab,
                     bpe_ranks=bpe_ranks,
                     byte_encoder=byte_encoder,
                     wordpiece_tokenizer=bert_tokenizer.tokenize,
                     namespace="bert",
                     use_starting_offsets=use_starting_offsets,
                     max_pieces=max_pieces,
                     max_pieces_per_token=max_pieces_per_token,
                     is_test=is_test,
                     do_lowercase=do_lowercase,
                     never_lowercase=never_lowercase,
                     start_tokens=["[CLS]"] if not special_tokens_fix else [],
                     end_tokens=["[SEP]"] if not special_tokens_fix else [],
                     truncate_long_sequences=truncate_long_sequences)