Python transformers.AutoTokenizer.from_pretrained() Examples
The following are 26 code examples of transformers.AutoTokenizer.from_pretrained(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module transformers.AutoTokenizer, or try the search function.
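As a quick orientation before the examples, here is a minimal sketch of the pattern they all build on: load a tokenizer by checkpoint name, then encode and decode text. The model name and sample sentence are arbitrary illustrative choices, not taken from any of the projects listed below.

from transformers import AutoTokenizer

# Download the tokenizer files for a checkpoint (or load them from the local cache).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode a sentence into token ids; add_special_tokens inserts [CLS] and [SEP].
token_ids = tokenizer.encode("Hello, transformers!", add_special_tokens=True)
print(token_ids)

# Decode the ids back to text to check the round trip.
print(tokenizer.decode(token_ids))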
Example #1
Source File: field.py From flambe with MIT License | 7 votes |
def __init__(self,
             alias: str,
             cache_dir: Optional[str] = None,
             max_len_truncate: int = 500,
             add_special_tokens: bool = True,
             **kwargs) -> None:
    """Initialize a pretrained tokenizer.

    Parameters
    ----------
    alias: str
        Alias of a pretrained tokenizer.
    cache_dir: str, optional
        A directory where to cache the downloaded vocabularies.
    max_len_truncate: int, default = 500
        Truncates the length of the tokenized sequence.
        Because several pretrained models crash when this is > 500,
        it defaults to 500.
    add_special_tokens: bool, optional
        Add the special tokens to the inputs. Default ``True``.

    """
    self._tokenizer = AutoTokenizer.from_pretrained(alias, cache_dir=cache_dir, **kwargs)
    self.max_len_truncate = max_len_truncate
    self.add_special_tokens = add_special_tokens
Example #2
Source File: transformers.py From nboost with Apache License 2.0 | 6 votes |
def __init__(self,
             model_dir: str = 'nboost/pt-tinybert-msmarco',
             verbose: bool = defaults.verbose,
             max_seq_len: int = defaults.max_seq_len,
             **kwargs):
    super().__init__(**kwargs)
    self.logger = set_logger(model_dir, verbose=verbose)
    self.max_seq_len = max_seq_len

    self.logger.info('Loading from checkpoint %s' % model_dir)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if self.device == torch.device("cpu"):
        self.logger.info("RUNNING ON CPU")
    else:
        self.logger.info("RUNNING ON CUDA")
        torch.cuda.synchronize(self.device)

    self.rerank_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
    self.rerank_model.to(self.device, non_blocking=True)
Example #3
Source File: onnxbert.py From nboost with Apache License 2.0 | 6 votes |
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    sess_options = rt.SessionOptions()
    self.model_dir = glob.glob(os.path.join(self.model_dir, '*.onnx'))[0]

    # Set graph optimization level to ORT_ENABLE_EXTENDED to enable bert optimization.
    sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

    # To enable model serialization and store the optimized graph to desired location.
    sess_options.optimized_model_filepath = self.model_dir
    self.session = rt.InferenceSession(self.model_dir, sess_options)

    if 'albert' in self.model_dir:
        self.tokenizer = AutoTokenizer.from_pretrained('albert-base-uncased')
    else:
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
Example #4
Source File: bert_models.py From danlp with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    path_emotion = download_model('bert.emotion', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)
    path_emotion = os.path.join(path_emotion, 'bert.emotion')
    path_reject = download_model('bert.noemotion', cache_dir,
                                 process_func=_unzip_process_func,
                                 verbose=verbose)
    path_reject = os.path.join(path_reject, 'bert.noemotion')

    # load the models
    self.tokenizer_rejct = BertTokenizer.from_pretrained(path_reject)
    self.model_reject = BertForSequenceClassification.from_pretrained(path_reject)
    self.tokenizer = BertTokenizer.from_pretrained(path_emotion)
    self.model = BertForSequenceClassification.from_pretrained(path_emotion)

    # load the class names mapping
    self.catagories = {5: 'Foragt/Modvilje', 2: 'Forventning/Interrese',
                       0: 'Glæde/Sindsro', 3: 'Overasket/Målløs',
                       1: 'Tillid/Accept', 4: 'Vrede/Irritation',
                       6: 'Sorg/trist', 7: 'Frygt/Bekymret'}
Example #5
Source File: test_perplexity_callback.py From catalyst with Apache License 2.0 | 6 votes |
def test_is_running():
    """Test if perplexity is running normal"""
    tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModelWithLMHead.from_pretrained("distilbert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    collate_fn = DataCollatorForLanguageModeling(tok).collate_batch
    dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    runner = HuggingFaceRunner()
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders={"train": dataloader},
        callbacks={
            "optimizer": dl.OptimizerCallback(),
            "perplexity": PerplexityMetricCallback(),
        },
        check=True,
    )
Example #6
Source File: prediction.py From fast-bert with Apache License 2.0 | 6 votes |
def __init__(
    self,
    model_path,
    label_path,
    multi_label=False,
    model_type="bert",
    use_fast_tokenizer=True,
    do_lower_case=True,
):
    self.model_path = model_path
    self.label_path = label_path
    self.multi_label = multi_label
    self.model_type = model_type
    self.do_lower_case = do_lower_case

    # Use auto-tokenizer
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.model_path, use_fast=use_fast_tokenizer
    )

    self.learner = self.get_learner()
Example #7
Source File: pipelines.py From exbert with Apache License 2.0 | 6 votes |
def get_defaults(self, model, tokenizer, framework):
    task_defaults = SUPPORTED_TASKS[self.task]
    if model is None:
        if framework == "tf":
            model = task_defaults["tf"].from_pretrained(task_defaults["default"]["model"]["tf"])
        elif framework == "pt":
            model = task_defaults["pt"].from_pretrained(task_defaults["default"]["model"]["pt"])
        else:
            raise ValueError("Provided framework should be either 'tf' for TensorFlow or 'pt' for PyTorch.")

    if tokenizer is None:
        default_tokenizer = task_defaults["default"]["tokenizer"]
        if isinstance(default_tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            tokenizer = AutoTokenizer.from_pretrained(default_tokenizer[0], **default_tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(default_tokenizer)

    return model, tokenizer
Example #8
Source File: question_answering.py From nlp-recipes with MIT License | 6 votes |
def __init__( self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir=".", ): self.model_name = model_name self.tokenizer = AutoTokenizer.from_pretrained( model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False, ) self.do_lower_case = to_lower self.custom_tokenize = custom_tokenize
Example #9
Source File: preprocess.py From unilm with MIT License | 6 votes |
def seg(args):
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path, do_lower_case=True
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + ".txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_box.txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_image.txt.tmp"),
        tokenizer,
        args.max_len,
    )
Example #10
Source File: transfo_experiment.py From axcell with Apache License 2.0 | 5 votes |
def tokenizer(self):
    if self._tokenizer is None:
        self._tokenizer = AutoTokenizer.from_pretrained(self.pretrained_name)
    return self._tokenizer
Example #11
Source File: transfo_experiment.py From axcell with Apache License 2.0 | 5 votes |
def train_model(self, data: TransfoDatabunch):
    self.set_seed("class")
    self.train_started = time.time()
    num_labels = data.num_labels
    config = AutoConfig.from_pretrained(self.pretrained_name, num_labels=num_labels)  # , finetuning_task=args.task_name
    model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_name, config=config)
    train(self, data.train_ds, data.valid_ds, model.to(self.device), self._tokenizer)
    model.to("cpu")
    return model
Example #12
Source File: download.py From exbert with Apache License 2.0 | 5 votes |
def run(self):
    from transformers import AutoModel, AutoTokenizer

    AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
    AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
Example #13
Source File: question_answering.py From nlp-recipes with MIT License | 5 votes |
def __init__( self, model_name="bert-base-cased", cache_dir=".", load_model_from_dir=None ): model = MODEL_CLASS[model_name].from_pretrained( model_name if load_model_from_dir is None else load_model_from_dir, cache_dir=cache_dir, output_loading_info=False, ) super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)
Example #14
Source File: test_language_modeling_dataset.py From catalyst with Apache License 2.0 | 5 votes |
def test_tokenizer_tokenizer():
    """Test initialization with tokenizer"""
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    assert dataset[0] is not None
    assert len(dataset) == 2
Example #15
Source File: test_language_modeling_dataset.py From catalyst with Apache License 2.0 | 5 votes |
def test_exception_with_sort():
    """Test lazy=True sort=True case"""
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = LanguageModelingDataset(  # noqa: F841
        texts, tok, lazy=True, sort=True
    )
Example #16
Source File: bert_models.py From danlp with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import AutoModelForTokenClassification
    from transformers import AutoTokenizer

    # download the model or load the model path
    weights_path = download_model('bert.ner', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)

    self.label_list = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                       "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

    self.model = AutoModelForTokenClassification.from_pretrained(weights_path)
    self.tokenizer = AutoTokenizer.from_pretrained(weights_path)
Example #17
Source File: bert_models.py From danlp with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # download the model or load the model path
    path_sub = download_model('bert.subjective', cache_dir,
                              process_func=_unzip_process_func,
                              verbose=verbose)
    path_sub = os.path.join(path_sub, 'bert.sub.v0.0.1')
    path_pol = download_model('bert.polarity', cache_dir,
                              process_func=_unzip_process_func,
                              verbose=verbose)
    path_pol = os.path.join(path_pol, 'bert.pol.v0.0.1')

    self.tokenizer_sub = BertTokenizer.from_pretrained(path_sub)
    self.model_sub = BertForSequenceClassification.from_pretrained(path_sub)
    self.tokenizer_pol = BertTokenizer.from_pretrained(path_pol)
    self.model_pol = BertForSequenceClassification.from_pretrained(path_pol)
Example #18
Source File: Transformer.py From sentence-transformers with Apache License 2.0 | 5 votes |
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             model_args: Dict = {}, cache_dir: Optional[str] = None):
    super(Transformer, self).__init__()
    self.config_keys = ['max_seq_length']
    self.max_seq_length = max_seq_length

    config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
    self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
    self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
Example #19
Source File: abstractive_summarization_bertsum.py From nlp-recipes with MIT License | 4 votes |
def __init__( self, processor, model_name="bert-base-uncased", finetune_bert=True, cache_dir=".", label_smoothing=0.1, test=False, max_pos_length=768, ): """Initialize an object of BertSumAbs. Args: processor (BertSumAbsProcessor): A processor with symbols, tokenizers and collate functions that are used in finetuning and prediction. model_name (str, optional:) Name of the pretrained model which is used to initialize the encoder of the BertSumAbs model. check MODEL_CLASS for supported models. Defaults to "bert-base-uncased". finetune_bert (bool, option): Whether the bert model in the encoder is finetune or not. Defaults to True. cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". label_smoothing (float, optional): The amount of label smoothing. Value range is [0, 1]. Defaults to 0.1. test (bool, optional): Whether the class is initiated for test or not. It must be True if the class obj is only initialized to load a checkpoint for test/inferencing. Defaults to False. max_pos_length (int, optional): maximum postional embedding length for the input. Defaults to 768. """ model = MODEL_CLASS[model_name].from_pretrained( model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False ) super().__init__(model_name=model_name, model=model, cache_dir=cache_dir) if model_name not in self.list_supported_models(): raise ValueError( "Model name {} is not supported by BertSumAbs. " "Call 'BertSumAbs.list_supported_models()' to get all supported model " "names.".format(value) ) self.model_class = MODEL_CLASS[model_name] self.cache_dir = cache_dir self.max_pos_length = max_pos_length self.model = AbsSummarizer( temp_dir=cache_dir, finetune_bert=finetune_bert, checkpoint=None, label_smoothing=label_smoothing, symbols=processor.symbols, test=test, max_pos=self.max_pos_length, ) self.processor = processor self.optim_bert = None self.optim_dec = None
Example #20
Source File: abstractive_summarization_bertsum.py From nlp-recipes with MIT License | 4 votes |
def __init__( self, model_name="bert-base-uncased", to_lower=True, cache_dir=".", max_src_len=640, max_tgt_len=140, ): """ Initialize the preprocessor. Args: model_name (str, optional): Transformer model name used in preprocessing. check MODEL_CLASS for supported models. Defaults to "bert-base-cased". to_lower (bool, optional): Whether to convert all letters to lower case during tokenization. This is determined by if a cased model is used. Defaults to True, which corresponds to a uncased model. cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". max_src_len (int, optional): Max number of tokens that be used as input. Defaults to 640. max_tgt_len (int, optional): Max number of tokens that be used as in target. Defaults to 140. """ self.model_name = model_name self.tokenizer = AutoTokenizer.from_pretrained( model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False, ) self.symbols = { "BOS": self.tokenizer.vocab["[unused0]"], "EOS": self.tokenizer.vocab["[unused1]"], "PAD": self.tokenizer.vocab["[PAD]"], "EOQ": self.tokenizer.vocab["[unused2]"], } self.sep_token = "[SEP]" self.cls_token = "[CLS]" self.pad_token = "[PAD]" self.tgt_bos = self.symbols["BOS"] self.tgt_eos = self.symbols["EOS"] self.max_src_len = max_src_len self.max_tgt_len = max_tgt_len
Example #21
Source File: extractive_summarization.py From nlp-recipes with MIT License | 4 votes |
def __init__( self, model_name="distilbert-base-uncased", to_lower=False, cache_dir=".", max_nsents=200, max_src_ntokens=2000, min_nsents=3, min_src_ntokens=5, ): """ Initialize the preprocessor. Args: model_name (str, optional): Transformer model name used in preprocessing. check MODEL_CLASS for supported models. Defaults to "bert-base-cased". to_lower (bool, optional): Whether to convert all letters to lower case during tokenization. This is determined by if a cased model is used. Defaults to False, which corresponds to a cased model. cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".". max_nsents (int, optional): Max number of sentences that can be used as input. Defaults to 200. max_src_ntokens (int, optional): Max number of tokens that be used as input. Defaults to 2000. min_nsents (int, optional): Minimum number of sentences that are required as input. If the input has less number of sentences than this value, it's skipped and cannot be used as a valid input. Defaults to 3. min_src_ntokens (int, optional): Minimum number of tokens that are required as an input sentence.If the input sentence has less number of tokens than this value, it's skipped and cannot be used as a valid sentence. Defaults to 5. """ self.model_name = model_name self.tokenizer = AutoTokenizer.from_pretrained( model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False, ) self.sep_vid = self.tokenizer.vocab["[SEP]"] self.cls_vid = self.tokenizer.vocab["[CLS]"] self.pad_vid = self.tokenizer.vocab["[PAD]"] self.max_nsents = max_nsents self.max_src_ntokens = max_src_ntokens self.min_nsents = min_nsents self.min_src_ntokens = min_src_ntokens
Example #22
Source File: language_modeling.py From catalyst with Apache License 2.0 | 4 votes |
def __init__(
    self,
    texts: Iterable[str],
    tokenizer: Union[str, PreTrainedTokenizer],
    max_seq_length: int = None,
    sort: bool = True,
    lazy: bool = False,
):
    """
    Args:
        texts (Iterable): Iterable object with text
        tokenizer (str or tokenizer): pretrained huggingface tokenizer or
            model name
        max_seq_length (int): max sequence length to tokenize
        sort (bool): If True then sort all sequences by length
            for efficient padding
        lazy (bool): If True then tokenize and encode sequence
            in __getitem__ method, else tokenize in __init__;
            if set to True, sorting is unavailable
    """
    if sort and lazy:
        raise Exception(
            "lazy is set to True so we can't sort"
            " sequences by length.\n"
            "You should set sort=False and lazy=True"
            " if you want to encode text in __get_item__ function"
        )
    if isinstance(tokenizer, str):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    elif isinstance(
        tokenizer, transformers.tokenization_utils.PreTrainedTokenizer
    ):
        self.tokenizer = tokenizer
    else:
        raise TypeError(
            "tokenizer argument should be a model name"
            + " or huggingface PreTrainedTokenizer"
        )

    self.max_seq_length = max_seq_length

    self.lazy = lazy

    if lazy:
        self.texts = texts

    if not lazy:
        pbar = tqdm(texts, desc="tokenizing texts")
        self.encoded = [
            self.tokenizer.encode(text, max_length=max_seq_length)
            for text in pbar
        ]
        if sort:
            self.encoded.sort(key=len)

    self.length = len(texts)

    self._getitem_fn = (
        self._getitem_lazy if lazy else self._getitem_encoded
    )
Example #23
Source File: text_classification.py From catalyst with Apache License 2.0 | 4 votes |
def __init__(
    self,
    texts: List[str],
    labels: List[str] = None,
    label_dict: Mapping[str, int] = None,
    max_seq_length: int = 512,
    model_name: str = "distilbert-base-uncased",
):
    """
    Args:
        texts (List[str]): a list with texts to classify or to train the
            classifier on
        labels (List[str]): a list with classification labels (optional)
        label_dict (dict): a dictionary mapping class names to class ids,
            to be passed to the validation data (optional)
        max_seq_length (int): maximal sequence length in tokens,
            texts will be stripped to this length
        model_name (str): transformer model name, needed to perform
            appropriate tokenization
    """
    self.texts = texts
    self.labels = labels
    self.label_dict = label_dict
    self.max_seq_length = max_seq_length

    if self.label_dict is None and labels is not None:
        # {'class1': 0, 'class2': 1, 'class3': 2, ...}
        # using this instead of `sklearn.preprocessing.LabelEncoder`
        # to easily handle unknown target values
        self.label_dict = dict(
            zip(sorted(set(labels)), range(len(set(labels))))
        )

    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    # suppresses tokenizer warnings
    logging.getLogger("transformers.tokenization_utils").setLevel(
        logging.FATAL
    )

    # special tokens for transformers
    # in the simplest case a [CLS] token is added at the beginning
    # and a [SEP] token is added at the end of a piece of text
    # [CLS] <indexes text tokens> [SEP] .. <[PAD]>
    self.sep_vid = self.tokenizer.vocab["[SEP]"]
    self.cls_vid = self.tokenizer.vocab["[CLS]"]
    self.pad_vid = self.tokenizer.vocab["[PAD]"]
Example #24
Source File: benchmarks.py From exbert with Apache License 2.0 | 4 votes |
def _compute_tensorflow(model_names, dictionary, average_over, amp):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name)
        model = TFAutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]
        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        print("Using model", model)

        @tf.function
        def inference(inputs):
            return model(inputs)

        for batch_size in batch_sizes:
            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = tf.stack(
                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
                    )

                    try:
                        print("Going through model with sequence of shape", sequence.shape)
                        # To make sure that the model is traced + that the tensors are on the appropriate device
                        inference(sequence)

                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    except tf.errors.ResourceExhaustedError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
    return dictionary
Example #25
Source File: benchmarks.py From exbert with Apache License 2.0 | 4 votes |
def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
        model = AutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]
        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            if fp16:
                model.half()
            model.to(device)
            model.eval()

            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
                else:
                    sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
                    try:
                        if torchscript:
                            print("Tracing model with sequence size", sequence.shape)
                            inference = torch.jit.trace(model, sequence)
                            inference(sequence)
                        else:
                            inference = model
                            inference(sequence)

                        print("Going through model with sequence of shape", sequence.shape)
                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
    return dictionary
Example #26
Source File: wordpiece_indexer.py From NLP_Toolkit with Apache License 2.0 | 4 votes |
def __init__(self,
             pretrained_model: str,
             use_starting_offsets: bool = False,
             do_lowercase: bool = True,
             never_lowercase: List[str] = None,
             max_pieces: int = 512,
             max_pieces_per_token=5,
             is_test=False,
             truncate_long_sequences: bool = True,
             special_tokens_fix: int = 0) -> None:
    if pretrained_model.endswith("-cased") and do_lowercase:
        logger.warning("Your BERT model appears to be cased, "
                       "but your indexer is lowercasing tokens.")
    elif pretrained_model.endswith("-uncased") and not do_lowercase:
        logger.warning("Your BERT model appears to be uncased, "
                       "but your indexer is not lowercasing tokens.")

    bert_tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model, do_lower_case=do_lowercase, do_basic_tokenize=False)

    # to adjust all tokenizers
    if hasattr(bert_tokenizer, 'encoder'):
        bert_tokenizer.vocab = bert_tokenizer.encoder
    if hasattr(bert_tokenizer, 'sp_model'):
        bert_tokenizer.vocab = defaultdict(lambda: 1)
        for i in range(bert_tokenizer.sp_model.get_piece_size()):
            bert_tokenizer.vocab[bert_tokenizer.sp_model.id_to_piece(i)] = i

    if special_tokens_fix:
        bert_tokenizer.add_tokens([START_TOKEN])
        bert_tokenizer.vocab[START_TOKEN] = len(bert_tokenizer) - 1

    if "roberta" in pretrained_model:
        bpe_ranks = bert_tokenizer.bpe_ranks
        byte_encoder = bert_tokenizer.byte_encoder
    else:
        bpe_ranks = {}
        byte_encoder = None

    super().__init__(vocab=bert_tokenizer.vocab,
                     bpe_ranks=bpe_ranks,
                     byte_encoder=byte_encoder,
                     wordpiece_tokenizer=bert_tokenizer.tokenize,
                     namespace="bert",
                     use_starting_offsets=use_starting_offsets,
                     max_pieces=max_pieces,
                     max_pieces_per_token=max_pieces_per_token,
                     is_test=is_test,
                     do_lowercase=do_lowercase,
                     never_lowercase=never_lowercase,
                     start_tokens=["[CLS]"] if not special_tokens_fix else [],
                     end_tokens=["[SEP]"] if not special_tokens_fix else [],
                     truncate_long_sequences=truncate_long_sequences)