Python transformers.PreTrainedTokenizer() Examples
The following are 7 code examples of transformers.PreTrainedTokenizer(). Each example is taken from an open-source project; the source file and license are noted above each snippet. You may also want to check out the other available functions and classes of the transformers module.
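Before the examples, here is a minimal sketch of how a PreTrainedTokenizer is typically obtained and used. The model name 'bert-base-uncased' is chosen purely for illustration, and use_fast=False keeps the pure-Python (slow) tokenizer that matches the PreTrainedTokenizer type annotation used throughout these examples.

import transformers

# Load a concrete PreTrainedTokenizer subclass from the model hub.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)

tokens = tokenizer.tokenize("Hello world")   # word pieces, e.g. ['hello', 'world']
ids = tokenizer.encode("Hello world")        # vocabulary ids with [CLS]/[SEP] added
text = tokenizer.decode(ids)                 # back to a string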
Example #1
Source File: bert.py From nyaggle with MIT License
def __init__(self, lang: str = 'en', n_components: Optional[int] = None, text_columns: List[str] = None,
             pooling_strategy: str = 'reduce_mean', use_cuda: bool = False,
             tokenizer: transformers.PreTrainedTokenizer = None, model=None,
             return_same_type: bool = True, column_format: str = '{col}_{idx}'):
    if tokenizer is not None:
        assert model is not None
        self.tokenizer = tokenizer
        self.model = model
    if lang == 'en':
        self.tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = transformers.BertModel.from_pretrained('bert-base-uncased')
    elif lang == 'jp':
        self.tokenizer = transformers.BertJapaneseTokenizer.from_pretrained('bert-base-japanese-whole-word-masking')
        self.model = transformers.BertModel.from_pretrained('bert-base-japanese-whole-word-masking')
    else:
        raise ValueError('Specified language type ({}) is invalid.'.format(lang))

    self.lang = lang
    self.n_components = n_components
    self.text_columns = text_columns
    self.pooling_strategy = pooling_strategy
    self.use_cuda = use_cuda
    self.return_same_type = return_same_type
    self.svd = {}
    self.column_format = column_format
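The constructor pairs a BERT tokenizer with a matching model per language. Below is a standalone sketch of the same selection logic (not part of nyaggle; the Japanese model additionally requires the MeCab-based dependencies of BertJapaneseTokenizer):

import transformers

def build_bert(lang: str = 'en'):
    # Mirror the language -> (tokenizer, model) mapping used in the constructor above.
    if lang == 'en':
        name, tokenizer_cls = 'bert-base-uncased', transformers.BertTokenizer
    elif lang == 'jp':
        name, tokenizer_cls = 'bert-base-japanese-whole-word-masking', transformers.BertJapaneseTokenizer
    else:
        raise ValueError('Specified language type ({}) is invalid.'.format(lang))
    return tokenizer_cls.from_pretrained(name), transformers.BertModel.from_pretrained(name)

tokenizer, model = build_bert('en')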
Example #2
Source File: pretrained_transformer_tokenizer.py From allennlp with Apache License 2.0
def tokenizer_lowercases(tokenizer: PreTrainedTokenizer) -> bool:
    # Huggingface tokenizers have different ways of remembering whether they lowercase or not. Detecting it
    # this way seems like the least brittle way to do it.
    tokenized = tokenizer.tokenize(
        "A"
    )  # Use a single character that won't be cut into word pieces.
    detokenized = " ".join(tokenized)
    return "a" in detokenized
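A quick way to see the helper in action; the model names are illustrative and are downloaded on first use:

from transformers import AutoTokenizer

lowercasing = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
case_preserving = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)

print(tokenizer_lowercases(lowercasing))      # True  -- "A" is tokenized as "a"
print(tokenizer_lowercases(case_preserving))  # False -- casing is preserved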
Example #3
Source File: cached_transformers.py From allennlp with Apache License 2.0
def get_tokenizer(model_name: str, **kwargs) -> transformers.PreTrainedTokenizer:
    cache_key = (model_name, frozenset(kwargs.items()))

    global _tokenizer_cache
    tokenizer = _tokenizer_cache.get(cache_key, None)
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, **kwargs)
        _tokenizer_cache[cache_key] = tokenizer
    return tokenizer
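The function memoizes tokenizers by model name and keyword arguments. The module-level cache itself is defined elsewhere in the source file; assuming it is a plain dict, a usage sketch looks like this:

import transformers  # needed by the snippet above

_tokenizer_cache = {}  # defined at module level in the original file

t1 = get_tokenizer("bert-base-uncased")
t2 = get_tokenizer("bert-base-uncased")
assert t1 is t2                                   # second call is a cache hit

t3 = get_tokenizer("bert-base-uncased", do_lower_case=False)
assert t3 is not t1                               # different kwargs -> different cache key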
Example #4
Source File: dataset.py From gpt-2-output-dataset with MIT License
def __init__(self, real_texts: List[str], fake_texts: List[str], tokenizer: PreTrainedTokenizer,
             max_sequence_length: int = None, min_sequence_length: int = None, epoch_size: int = None,
             token_dropout: float = None, seed: int = None):
    self.real_texts = real_texts
    self.fake_texts = fake_texts
    self.tokenizer = tokenizer
    self.max_sequence_length = max_sequence_length
    self.min_sequence_length = min_sequence_length
    self.epoch_size = epoch_size
    self.token_dropout = token_dropout
    self.random = np.random.RandomState(seed)
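The snippet only shows the constructor; the enclosing dataset class is not named here. A construction sketch, assuming a hypothetical EncodedDataset class with exactly this __init__, toy texts, and an illustrative model name:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base", use_fast=False)

dataset = EncodedDataset(                         # class name assumed for illustration
    real_texts=["A paragraph written by a person."],
    fake_texts=["A paragraph sampled from GPT-2."],
    tokenizer=tokenizer,
    max_sequence_length=128,
    token_dropout=None,
    seed=42,
)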
Example #5
Source File: language_modeling_utils.py From simpletransformers with Apache License 2.0
def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """

    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling."
            " Set 'mlm' to False in args if you want to use this tokenizer."
        )

    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, which defaults to 0.15 in BERT/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    if args.model_type == "electra" and False:
        # For ELECTRA, we replace all masked input tokens with tokenizer.mask_token
        inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    else:
        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with a random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
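A small driver for mask_tokens; since the function only reads args.mlm_probability and args.model_type, a SimpleNamespace stands in for the real simpletransformers args object, and the model name is illustrative:

from types import SimpleNamespace

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
args = SimpleNamespace(mlm_probability=0.15, model_type="bert")

batch = tokenizer(
    ["The quick brown fox.", "It jumps over the lazy dog."],
    padding=True,
    return_tensors="pt",
)
inputs, labels = mask_tokens(batch["input_ids"], tokenizer, args)
# labels is -100 everywhere except at the ~15% of positions selected for masking;
# inputs has those positions replaced by [MASK], a random token, or left unchanged.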
Example #6
Source File: test_tokenization_utils.py From exbert with Apache License 2.0
def check_tokenizer_from_pretrained(self, tokenizer_class):
    s3_models = list(tokenizer_class.max_model_input_sizes.keys())
    for model_name in s3_models[:1]:
        tokenizer = tokenizer_class.from_pretrained(model_name)
        self.assertIsNotNone(tokenizer)
        self.assertIsInstance(tokenizer, tokenizer_class)
        self.assertIsInstance(tokenizer, PreTrainedTokenizer)

        for special_tok in tokenizer.all_special_tokens:
            self.assertIsInstance(special_tok, str)
            special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
            self.assertIsInstance(special_tok_id, int)
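The helper uses unittest assertions, so it is meant to live on a TestCase. A hedged wiring sketch, assuming the method above is copied onto the class and using BertTokenizer as an illustrative tokenizer class (max_model_input_sizes is populated for the classic tokenizers in the transformers versions these examples target):

import unittest

from transformers import BertTokenizer


class TokenizerFromPretrainedTest(unittest.TestCase):
    # check_tokenizer_from_pretrained is assumed to be defined on this class,
    # e.g. copied verbatim from the snippet above.

    def test_bert_tokenizer(self):
        self.check_tokenizer_from_pretrained(BertTokenizer)


if __name__ == "__main__":
    unittest.main()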
Example #7
Source File: language_modeling.py From catalyst with Apache License 2.0
def __init__(
    self,
    texts: Iterable[str],
    tokenizer: Union[str, PreTrainedTokenizer],
    max_seq_length: int = None,
    sort: bool = True,
    lazy: bool = False,
):
    """
    Args:
        texts (Iterable): Iterable object with text
        tokenizer (str or tokenizer): pre-trained huggingface tokenizer or model name
        max_seq_length (int): max sequence length to tokenize
        sort (bool): If True then sort all sequences by length for efficient padding
        lazy (bool): If True then tokenize and encode sequences in the __getitem__ method,
            else tokenize in __init__; note that sorting is unavailable when lazy is True
    """
    if sort and lazy:
        raise Exception(
            "lazy is set to True so we can't sort"
            " sequences by length.\n"
            "You should set sort=False and lazy=True"
            " if you want to encode text in the __getitem__ function"
        )

    if isinstance(tokenizer, str):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    elif isinstance(
        tokenizer, transformers.tokenization_utils.PreTrainedTokenizer
    ):
        self.tokenizer = tokenizer
    else:
        raise TypeError(
            "tokenizer argument should be a model name"
            + " or huggingface PreTrainedTokenizer"
        )

    self.max_seq_length = max_seq_length

    self.lazy = lazy

    if lazy:
        self.texts = texts

    if not lazy:
        pbar = tqdm(texts, desc="tokenizing texts")
        self.encoded = [
            self.tokenizer.encode(text, max_length=max_seq_length)
            for text in pbar
        ]
        if sort:
            self.encoded.sort(key=len)

    self.length = len(texts)

    self._getitem_fn = (
        self._getitem_lazy if lazy else self._getitem_encoded
    )
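A hedged usage sketch: the enclosing class in this catalyst source file is assumed to be LanguageModelingDataset, and the model name is illustrative. Passing a string lets the dataset load the tokenizer itself, while passing an existing slow tokenizer (use_fast=False, so it satisfies the PreTrainedTokenizer isinstance check) also works:

from transformers import AutoTokenizer

texts = [
    "Distributed representations of words and phrases.",
    "Attention is all you need.",
]

# Option 1: pass a model name; the tokenizer is loaded via AutoTokenizer.
dataset = LanguageModelingDataset(texts, "bert-base-uncased", max_seq_length=32)

# Option 2: pass an already-constructed PreTrainedTokenizer and tokenize lazily.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
lazy_dataset = LanguageModelingDataset(texts, tokenizer, sort=False, lazy=True)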