Python transformers.PreTrainedTokenizer() Examples

The following are 7 code examples of transformers.PreTrainedTokenizer(), extracted from open source projects. The originating project, source file, and license are noted above each example. You may also want to check out all available functions and classes of the transformers module.
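For orientation, here is a minimal sketch of how a PreTrainedTokenizer is usually obtained and used; the 'bert-base-uncased' checkpoint is chosen only for illustration and is assumed to be downloadable or already cached locally:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize("Hello, world!")   # subword tokens, e.g. ['hello', ',', 'world', '!']
ids = tokenizer.encode("Hello, world!")        # token ids, including special tokens such as [CLS]/[SEP]
print(tokens)
print(ids)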
Example #1
Source File: bert.py    From nyaggle with MIT License
def __init__(self, lang: str = 'en', n_components: Optional[int] = None,
                 text_columns: List[str] = None, pooling_strategy: str = 'reduce_mean',
                 use_cuda: bool = False, tokenizer: transformers.PreTrainedTokenizer = None,
                 model=None, return_same_type: bool = True, column_format: str = '{col}_{idx}'):
        if tokenizer is not None:
            assert model is not None
            self.tokenizer = tokenizer
            self.model = model
        elif lang == 'en':
            self.tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
            self.model = transformers.BertModel.from_pretrained('bert-base-uncased')
        elif lang == 'jp':
            self.tokenizer = transformers.BertJapaneseTokenizer.from_pretrained('bert-base-japanese-whole-word-masking')
            self.model = transformers.BertModel.from_pretrained('bert-base-japanese-whole-word-masking')
        else:
            raise ValueError('Specified language type ({}) is invalid.'.format(lang))

        self.lang = lang
        self.n_components = n_components
        self.text_columns = text_columns
        self.pooling_strategy = pooling_strategy
        self.use_cuda = use_cuda
        self.return_same_type = return_same_type
        self.svd = {}
        self.column_format = column_format 
Example #2
Source File: pretrained_transformer_tokenizer.py    From allennlp with Apache License 2.0
def tokenizer_lowercases(tokenizer: PreTrainedTokenizer) -> bool:
        # Huggingface tokenizers have different ways of remembering whether they lowercase or not. Detecting it
        # this way seems like the least brittle way to do it.
        tokenized = tokenizer.tokenize(
            "A"
        )  # Use a single character that won't be cut into word pieces.
        detokenized = " ".join(tokenized)
        return "a" in detokenized 
Example #3
Source File: cached_transformers.py    From allennlp with Apache License 2.0
def get_tokenizer(model_name: str, **kwargs) -> transformers.PreTrainedTokenizer:
    cache_key = (model_name, frozenset(kwargs.items()))

    global _tokenizer_cache
    tokenizer = _tokenizer_cache.get(cache_key, None)
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, **kwargs)
        _tokenizer_cache[cache_key] = tokenizer
    return tokenizer 
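A hypothetical usage sketch: because the keyword arguments are folded into the cache key, repeated calls with the same arguments return the same tokenizer instance. It assumes get_tokenizer and its module-level _tokenizer_cache dict (referenced by the global statement above) live in the same module as this snippet:

_tokenizer_cache = {}  # module-level cache referenced by get_tokenizer

t1 = get_tokenizer("bert-base-uncased", do_lower_case=True)
t2 = get_tokenizer("bert-base-uncased", do_lower_case=True)
assert t1 is t2          # second call is served from the cache
t3 = get_tokenizer("bert-base-uncased", do_lower_case=False)
assert t3 is not t1      # different kwargs, different cache key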
Example #4
Source File: dataset.py    From gpt-2-output-dataset with MIT License
def __init__(self, real_texts: List[str], fake_texts: List[str], tokenizer: PreTrainedTokenizer,
                 max_sequence_length: int = None, min_sequence_length: int = None, epoch_size: int = None,
                 token_dropout: float = None, seed: int = None):
        self.real_texts = real_texts
        self.fake_texts = fake_texts
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length
        self.min_sequence_length = min_sequence_length
        self.epoch_size = epoch_size
        self.token_dropout = token_dropout
        self.random = np.random.RandomState(seed) 
Example #5
Source File: language_modeling_utils.py    From simpletransformers with Apache License 2.0
def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """

    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling."
            "Set 'mlm' to False in args if you want to use this tokenizer."
        )

    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, which defaults to 0.15 in BERT/RoBERTa)
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    if args.model_type == "electra" and False:
        # For ELECTRA, we replace all masked input tokens with tokenizer.mask_token
        inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    else:
        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels 
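A minimal sketch of calling mask_tokens. The args namespace is a placeholder for the project's real args object, a tokenizer with a mask token (e.g. BERT's) is required, and a transformers version compatible with the excerpt above is assumed (it touches the private tokenizer._pad_token attribute):

from types import SimpleNamespace
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
args = SimpleNamespace(mlm_probability=0.15, model_type="bert")  # placeholder args

batch = tokenizer(["a short example sentence"], return_tensors="pt", padding=True)
inputs, labels = mask_tokens(batch["input_ids"], tokenizer, args)
# labels is -100 everywhere except the positions sampled for masking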
Example #6
Source File: test_tokenization_utils.py    From exbert with Apache License 2.0
def check_tokenizer_from_pretrained(self, tokenizer_class):
        s3_models = list(tokenizer_class.max_model_input_sizes.keys())
        for model_name in s3_models[:1]:
            tokenizer = tokenizer_class.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, tokenizer_class)
            self.assertIsInstance(tokenizer, PreTrainedTokenizer)

            for special_tok in tokenizer.all_special_tokens:
                self.assertIsInstance(special_tok, str)
                special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
                self.assertIsInstance(special_tok_id, int) 
Example #7
Source File: language_modeling.py    From catalyst with Apache License 2.0
def __init__(
        self,
        texts: Iterable[str],
        tokenizer: Union[str, PreTrainedTokenizer],
        max_seq_length: int = None,
        sort: bool = True,
        lazy: bool = False,
    ):
        """
        Args:
            texts (Iterable): Iterable object with text
            tokenizer (str or tokenizer): pre trained
                huggingface tokenizer or model name
            max_seq_length (int): max sequence length to tokenize
            sort (bool): If True then sort all sequences by length
                for efficient padding
            lazy (bool): If True then tokenize and encode sequence
                in __getitem__ method
                else will tokenize in __init__ also
                if set to true sorting is unavialible
        """
        if sort and lazy:
            raise Exception(
                "lazy is set to True so we can't sort"
                " sequences by length.\n"
                "You should set sort=False and lazy=True"
                " if you want to encode text in __get_item__ function"
            )
        if isinstance(tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        elif isinstance(
            tokenizer, transformers.tokenization_utils.PreTrainedTokenizer
        ):
            self.tokenizer = tokenizer
        else:
            raise TypeError(
                "tokenizer argument should be a model name"
                + " or huggingface PreTrainedTokenizer"
            )

        self.max_seq_length = max_seq_length

        self.lazy = lazy

        if lazy:
            self.texts = texts

        if not lazy:
            pbar = tqdm(texts, desc="tokenizing texts")
            self.encoded = [
                self.tokenizer.encode(text, max_length=max_seq_length)
                for text in pbar
            ]
            if sort:
                self.encoded.sort(key=len)

        self.length = len(texts)

        self._getitem_fn = (
            self._getitem_lazy if lazy else self._getitem_encoded
        )