Python transformers.BertTokenizer() Examples

The following are 16 code examples of transformers.BertTokenizer(), drawn from open-source projects. The source file and project for each example are noted above its code.
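For orientation, BertTokenizer can be built directly from a WordPiece vocabulary file and then used to tokenize and encode text. A minimal sketch, assuming a local vocab file (the path below is illustrative, not taken from any of the projects):

from transformers import BertTokenizer

# Build a tokenizer from a local WordPiece vocab file (path is an assumption for illustration).
tokenizer = BertTokenizer("vocab.txt", do_lower_case=True)

# Split text into WordPiece tokens and map them to vocabulary ids.
tokens = tokenizer.tokenize("hello world")
ids = tokenizer.convert_tokens_to_ids(tokens)

# encode() performs both steps and, with add_special_tokens=True, wraps the ids in [CLS] ... [SEP].
input_ids = tokenizer.encode("hello world", add_special_tokens=True)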
Example #1
Source File: loader.py    From textclf with MIT License
def build_loader(pairs, dictionary_or_tokenizer, label2id, config):
    pairs = [(text, label2id[label]) for text, label in pairs]
    if isinstance(dictionary_or_tokenizer, Dictionary):
        col_fn = partial(collate_fn, dictionary_or_tokenizer, config.max_len)
    elif isinstance(dictionary_or_tokenizer, BertTokenizer):
        col_fn = partial(bert_collate_fn, dictionary_or_tokenizer, config.max_len)

    loader = DataLoader(
        dataset=TextClfDataset(pairs),
        collate_fn=col_fn,
        batch_size=config.batch_size,
        shuffle=config.shuffle,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory,
        drop_last=config.drop_last
    )
    return loader 
Example #2
Source File: loader.py    From textclf with MIT License
def bert_collate_fn(
    tokenizer: BertTokenizer,
    max_len: int,
    pairs: Iterable[Tuple[str, int]]
):
    pairs = [(text.split()[:max_len], label) for text, label in pairs]
    texts, labels = zip(*pairs)
    labels = torch.LongTensor(labels)
    # +1 for [CLS] token
    text_lens = torch.LongTensor([len(text)+1 for text in texts])
    max_len = text_lens.max().item()
    ids = torch.ones(len(texts), max_len).long() * tokenizer.pad_token_id
    for i, text in enumerate(texts):
        ids[i][:len(text)+1] = torch.LongTensor(
            tokenizer.encode(text, add_special_tokens=True)[:-1])
    return ids, text_lens, labels 
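For reference, a minimal call sketch for bert_collate_fn above, assuming a local vocab file (the path and batch below are illustrative, not from the textclf project):

tokenizer = BertTokenizer("vocab.txt")
batch = [("a short example", 1), ("another slightly longer example sentence", 0)]

# Returns padded id tensors, per-sample lengths (each including [CLS]), and a label tensor.
ids, text_lens, labels = bert_collate_fn(tokenizer, max_len=32, pairs=batch)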
Example #3
Source File: squad_QSL.py    From inference with Apache License 2.0
def __init__(self, perf_count=None, cache_path='eval_features.pickle'):
        print("Constructing QSL...")
        eval_features = []
        # Load features if cached, convert from examples otherwise.
        if os.path.exists(cache_path):
            print("Loading cached features from '%s'..." % cache_path)
            with open(cache_path, 'rb') as cache_file:
                eval_features = pickle.load(cache_file)
        else:
            print("No cached features at '%s'... converting from examples..." % cache_path)

            print("Creating tokenizer...")
            tokenizer = BertTokenizer("build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt")

            print("Reading examples...")
            eval_examples = read_squad_examples(input_file="build/data/dev-v1.1.json",
                is_training=False, version_2_with_negative=False)

            print("Converting examples to features...")
            def append_feature(feature):
                eval_features.append(feature)

            convert_examples_to_features(
                examples=eval_examples,
                tokenizer=tokenizer,
                max_seq_length=max_seq_length,
                doc_stride=doc_stride,
                max_query_length=max_query_length,
                is_training=False,
                output_fn=append_feature,
                verbose_logging=False)

            print("Caching features at '%s'..." % cache_path)
            with open(cache_path, 'wb') as cache_file:
                pickle.dump(eval_features, cache_file)

        self.eval_features = eval_features
        self.count = len(self.eval_features)
        self.perf_count = perf_count if perf_count is not None else self.count
        self.qsl = lg.ConstructQSL(self.count, self.perf_count, self.load_query_samples, self.unload_query_samples)
        print("Finished constructing QSL.") 
Example #4
Source File: seq2seq_gpt2.py    From MultiTurnDialogZoo with MIT License
def __init__(self, config_path):
        super(transformer_gpt2, self).__init__()
        self.tokenzier = BertTokenizer(vocab_file='config/vocab_en.txt')
        self.vocab_size = len(self.tokenzier)
        
        self.model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(config_path)
        self.model = GPT2LMHeadModel(config=self.model_config)
        self.model.resize_token_embeddings(self.vocab_size)
        
        self.n_ctx = self.model.config.to_dict().get('n_ctx') 
Example #5
Source File: bert_processor.py    From Bert-Multi-Label-Text-Classification with MIT License
def __init__(self, vocab_path, do_lower_case):
        self.tokenizer = BertTokenizer(vocab_path, do_lower_case)
Example #6
Source File: test_multi_turn_dialog.py    From cotk with Apache License 2.0
def load_ubuntucorpus_bert():
	def _load_ubuntucorpus(min_rare_vocab_times=0):
		from transformers import BertTokenizer
		toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
		return UbuntuCorpus("./tests/dataloader/dummy_ubuntucorpus#Ubuntu", min_rare_vocab_times=min_rare_vocab_times, tokenizer=toker, pretrained="bert")
	return _load_ubuntucorpus 
Example #7
Source File: test_multi_turn_dialog.py    From cotk with Apache License 2.0
def load_switchboardcorpus_bert():
	def _load_switchboardcorpus(min_rare_vocab_times=0):
		from transformers import BertTokenizer
		toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
		return SwitchboardCorpus("./tests/dataloader/dummy_switchboardcorpus#SwitchboardCorpus",
								 min_rare_vocab_times=min_rare_vocab_times, tokenizer=toker, pretrained="bert")

	return _load_switchboardcorpus 
Example #8
Source File: test_single_turn_dialog.py    From cotk with Apache License 2.0
def load_opensubtitles_bert():
	def _load_opensubtitles(invalid_vocab_times=0):
		from transformers import BertTokenizer
		toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
		return OpenSubtitles("./tests/dataloader/dummy_opensubtitles#OpenSubtitles", tokenizer=toker, pretrained='bert', min_rare_vocab_times=invalid_vocab_times)
	return _load_opensubtitles 
Example #9
Source File: test_sentence_classification.py    From cotk with Apache License 2.0
def load_sst_bert():
	def _load_sst(min_rare_vocab_times=0):
		from transformers import BertTokenizer
		toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
		return SST("./tests/dataloader/dummy_sst#SST", tokenizer=toker, min_rare_vocab_times=min_rare_vocab_times, pretrained="bert")
	return _load_sst 
Example #10
Source File: test_language_generation.py    From cotk with Apache License 2.0
def load_mscoco_bert():
	def _load_mscoco(invalid_vocab_times=0):
		from transformers import BertTokenizer
		toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
		return MSCOCO("./tests/dataloader/dummy_mscoco#MSCOCO", tokenizer=toker, pretrained='bert', min_rare_vocab_times=invalid_vocab_times)
	return _load_mscoco 
Example #11
Source File: tokenization.py    From bert-japanese with Apache License 2.0
def __init__(self, vocab_file, do_lower_case=False,
                 do_basic_tokenize=True, do_wordpiece_tokenize=True,
                 mecab_dict_path=None, unk_token='[UNK]', sep_token='[SEP]',
                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
        """Constructs a MecabBertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_basic_tokenize**: (`optional`) boolean (default True)
                Whether to do basic tokenization with MeCab before wordpiece.
            **mecab_dict_path**: (`optional`) string
                Path to a directory of a MeCab dictionary.
        """
        super(BertTokenizer, self).__init__(
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = MecabBasicTokenizer(do_lower_case=do_lower_case,
                                                       mecab_dict_path=mecab_dict_path)

        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                          unk_token=self.unk_token) 
Example #12
Source File: tokenization.py    From bert-japanese with Apache License 2.0
def __init__(self, vocab_file, do_lower_case=False, do_basic_tokenize=True,
                 mecab_dict_path=None, unk_token='[UNK]', sep_token='[SEP]',
                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
        """Constructs a MecabCharacterBertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_basic_tokenize**: (`optional`) boolean (default True)
                Whether to do basic tokenization with MeCab before wordpiece.
            **mecab_dict_path**: (`optional`) string
                Path to a directory of a MeCab dictionary.
        """
        super(BertTokenizer, self).__init__(
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = MecabBasicTokenizer(do_lower_case=do_lower_case,
                                                       mecab_dict_path=mecab_dict_path,
                                                       preserve_spaces=True)

        self.wordpiece_tokenizer = CharacterTokenizer(vocab=self.vocab,
                                                      unk_token=self.unk_token,
                                                      with_markers=True) 
Example #13
Source File: transformers_preprocessor.py    From DeepPavlov with Apache License 2.0
def __init__(self, vocab_file: str,
                 do_lower_case: bool = False,
                 max_seq_length: int = 512,
                 tokenize_chinese_chars: bool = True,
                 **kwargs):
        vocab_file = expand_path(vocab_file)
        self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case,
                                       tokenize_chinese_chars=tokenize_chinese_chars)
        self.max_seq_length = max_seq_length 
Example #14
Source File: dl_tester.py    From textclf with MIT License
def _preprocess(self, text):
        text_tokenized = self.tokenizer(text)
        if isinstance(self.dictionary, Dictionary):
            text_processed = self.dictionary.tokens_to_tensor(
                text_tokenized, max_len=self.config.max_len
            )
            text_len = (text_processed != self.dictionary.pad()).sum()
        elif isinstance(self.dictionary, BertTokenizer):
            text_processed = torch.LongTensor(
                self.dictionary.encode(text_tokenized, add_special_tokens=True)[:-1])
            max_len = self.config.max_len
            pad_id = self.dictionary.pad_token_id
            if len(text_processed) >= max_len:
                text_processed = text_processed[:max_len]
            else:
                text_processed = torch.cat([
                    text_processed,
                    torch.ones(max_len-len(text_processed)).long()*pad_id
                ])
            text_len = (text_processed != pad_id).sum()

        if self.use_cuda:
            text_processed = text_processed.cuda()
            text_len = text_len.cuda()

        return text_processed.unsqueeze(0), text_len.unsqueeze(0) 
Example #15
Source File: squad_eval.py    From inference with Apache License 2.0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file", default="build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt", help="Path to vocab.txt")
    parser.add_argument("--val_data", default="build/data/dev-v1.1.json", help="Path to validation data")
    parser.add_argument("--log_file", default="build/logs/mlperf_log_accuracy.json", help="Path to LoadGen accuracy log")
    parser.add_argument("--out_file", default="build/result/predictions.json", help="Path to output predictions file")
    parser.add_argument("--features_cache_file", default="eval_features.pickle", help="Path to features' cache file")
    parser.add_argument("--output_transposed", action="store_true", help="Transpose the output")
    args = parser.parse_args()

    print("Reading examples...")
    eval_examples = read_squad_examples(input_file=args.val_data,
        is_training=False, version_2_with_negative=False)

    eval_features = []
    # Load features if cached, convert from examples otherwise.
    cache_path = args.features_cache_file
    if os.path.exists(cache_path):
        print("Loading cached features from '%s'..." % cache_path)
        with open(cache_path, 'rb') as cache_file:
            eval_features = pickle.load(cache_file)
    else:
        print("No cached features at '%s'... converting from examples..." % cache_path)

        print("Creating tokenizer...")
        tokenizer = BertTokenizer(args.vocab_file)

        print("Converting examples to features...")
        def append_feature(feature):
            eval_features.append(feature)

        convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            output_fn=append_feature,
            verbose_logging=False)

        print("Caching features at '%s'..." % cache_path)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(eval_features, cache_file)

    print("Loading LoadGen logs...")
    results = load_loadgen_log(args.log_file, eval_features, args.output_transposed)

    print("Post-processing predictions...")
    write_predictions(eval_examples, eval_features, results, 20, 30, True, args.out_file)

    print("Evaluating predictions...")
    cmd = "python3 build/data/evaluate-v1.1.py build/data/dev-v1.1.json build/result/predictions.json"
    subprocess.check_call(cmd, shell=True) 
Example #16
Source File: utils.py    From MultiTurnDialogZoo with MIT License
def transformer_preprocess(src_path, tgt_path, tokenized_file, 
                           vocab_file='./config/vocab_en.txt', ctx=200):
    '''
    Tokenize the dataset for NLG (GPT2) and write the tokenized ids into tokenized_file.
    More details can be found at https://github.com/yangjianxin1/GPT2-chitchat
    '''
    def clean_inside(s):
        s = s.replace('<user0>', '')
        s = s.replace('<user1>', '')
        s = s.strip()
        s = clean(s)
        return s
        
    # create the Bert tokenizer of the GPT2 model
    tokenizer = BertTokenizer(vocab_file=vocab_file)
    
    src_data, tgt_data = read_file(src_path), read_file(tgt_path)
    src_data = [' '.join(i) for i in src_data]
    tgt_data = [' '.join(i) for i in tgt_data]
    assert len(src_data) == len(tgt_data), f'[!] length of src and tgt: {len(src_data)}/{len(tgt_data)}'
    
    # combine them
    corpus = []
    longest = 0
    for s, t in tqdm(list(zip(src_data, tgt_data))):
        item = [tokenizer.cls_token_id]   # [CLS] at the beginning of each dialogue
        s = s + ' __eou__ ' + t
        s = clean_inside(s)
        utterances = s.split('__eou__')
        for utterance in utterances:
            words = nltk.word_tokenize(utterance)
            item.extend([tokenizer.convert_tokens_to_ids(word) for word in words])
            item.append(tokenizer.sep_token_id)
        if len(item) > longest:
            longest = len(item)
        item = item[:ctx]
        corpus.append(item)
        
    # write into the file
    with open(tokenized_file, 'w') as f:
        for i in range(len(corpus)):
            words = [str(word) for word in corpus[i]]
            f.write(f'{" ".join(words)}')
            if i < len(corpus) - 1:
                f.write('\n')
                
    print(f'[!] Preprocessed the data for the transformer (GPT2); longest sentence: {longest}; wrote the data into {tokenized_file}.')
    
    