Python transformers.BertTokenizer() Examples
The following are 16 code examples of transformers.BertTokenizer(). You can go to the original project or source file by following the link above each example, or check out all other available functions and classes of the transformers module.
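As a quick orientation before the examples: BertTokenizer can be constructed directly from a local WordPiece vocabulary file, which is the pattern most of the snippets below use. A minimal sketch of that usage follows; the vocab path and the from_pretrained model name are illustrative assumptions, not taken from any example on this page.

from transformers import BertTokenizer

# construct from a local one-token-per-line vocab file (path is hypothetical)
tokenizer = BertTokenizer('vocab.txt')

# or download a pretrained vocabulary (model name is an assumption)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokens = tokenizer.tokenize('Hello world!')                       # WordPiece tokens
ids = tokenizer.encode('Hello world!', add_special_tokens=True)   # [CLS] ... [SEP]
text = tokenizer.decode(ids)                                      # back to a string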
Example #1
Source File: loader.py From textclf with MIT License
def build_loader(pairs, dictionary_or_tokenizer, label2id, config):
    pairs = [(text, label2id[label]) for text, label in pairs]
    if isinstance(dictionary_or_tokenizer, Dictionary):
        col_fn = partial(collate_fn, dictionary_or_tokenizer, config.max_len)
    elif isinstance(dictionary_or_tokenizer, BertTokenizer):
        col_fn = partial(bert_collate_fn, dictionary_or_tokenizer, config.max_len)
    loader = DataLoader(
        dataset=TextClfDataset(pairs),
        collate_fn=col_fn,
        batch_size=config.batch_size,
        shuffle=config.shuffle,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory,
        drop_last=config.drop_last
    )
    return loader
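The dispatch above pre-binds the tokenizer and the maximum length into the collate function with functools.partial, so DataLoader only ever passes the batch. A minimal standalone sketch of that binding pattern (the names and values are illustrative, not from textclf):

from functools import partial

def collate(pad_id, max_len, batch):
    # pad or truncate every sequence in the batch to max_len
    return [seq[:max_len] + [pad_id] * (max_len - len(seq)) for seq in batch]

col_fn = partial(collate, 0, 5)  # bind pad_id=0 and max_len=5 ahead of time
print(col_fn([[1, 2], [3, 4, 5, 6, 7, 8]]))  # [[1, 2, 0, 0, 0], [3, 4, 5, 6, 7]]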
Example #2
Source File: loader.py From textclf with MIT License
def bert_collate_fn(
        tokenizer: BertTokenizer,
        max_len: int,
        pairs: Iterable[Tuple[str, int]]
):
    pairs = [(text.split()[:max_len], label) for text, label in pairs]
    texts, labels = zip(*pairs)
    labels = torch.LongTensor(labels)
    # +1 for [CLS] token
    text_lens = torch.LongTensor([len(text) + 1 for text in texts])
    max_len = text_lens.max().item()
    ids = torch.ones(len(texts), max_len).long() * tokenizer.pad_token_id
    for i, text in enumerate(texts):
        ids[i][:len(text) + 1] = torch.LongTensor(
            tokenizer.encode(text, add_special_tokens=True)[:-1])
    return ids, text_lens, labels
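The encode(..., add_special_tokens=True)[:-1] idiom above keeps the leading [CLS] but drops the trailing [SEP], which is why lengths are counted as len(text) + 1. A small sketch of what that slice does, assuming a standard pretrained vocabulary (the model name is an assumption):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ids = tokenizer.encode(['hello', 'world'], add_special_tokens=True)
# ids is [CLS] hello world [SEP]; ids[:-1] removes the final [SEP]
print(tokenizer.convert_ids_to_tokens(ids[:-1]))  # ['[CLS]', 'hello', 'world']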
Example #3
Source File: squad_QSL.py From inference with Apache License 2.0
def __init__(self, perf_count=None, cache_path='eval_features.pickle'):
    print("Constructing QSL...")
    eval_features = []
    # Load features if cached, convert from examples otherwise.
    if os.path.exists(cache_path):
        print("Loading cached features from '%s'..." % cache_path)
        with open(cache_path, 'rb') as cache_file:
            eval_features = pickle.load(cache_file)
    else:
        print("No cached features at '%s'... converting from examples..." % cache_path)

        print("Creating tokenizer...")
        tokenizer = BertTokenizer("build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt")

        print("Reading examples...")
        eval_examples = read_squad_examples(input_file="build/data/dev-v1.1.json",
                                            is_training=False,
                                            version_2_with_negative=False)

        print("Converting examples to features...")
        def append_feature(feature):
            eval_features.append(feature)
        convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            output_fn=append_feature,
            verbose_logging=False)

        print("Caching features at '%s'..." % cache_path)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(eval_features, cache_file)

    self.eval_features = eval_features
    self.count = len(self.eval_features)
    self.perf_count = perf_count if perf_count is not None else self.count
    self.qsl = lg.ConstructQSL(self.count, self.perf_count,
                               self.load_query_samples, self.unload_query_samples)
    print("Finished constructing QSL.")
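The cache-or-convert logic above is a general pattern worth extracting. A minimal sketch of the same idea as a standalone helper; the function name is my own, not part of the inference repo:

import os
import pickle

def load_or_build(cache_path, build_fn):
    # return cached features if present, otherwise build and cache them
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    features = build_fn()
    with open(cache_path, 'wb') as f:
        pickle.dump(features, f)
    return features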
Example #4
Source File: seq2seq_gpt2.py From MultiTurnDialogZoo with MIT License
def __init__(self, config_path):
    super(transformer_gpt2, self).__init__()
    self.tokenizer = BertTokenizer(vocab_file='config/vocab_en.txt')
    self.vocab_size = len(self.tokenizer)
    self.model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(config_path)
    self.model = GPT2LMHeadModel(config=self.model_config)
    self.model.resize_token_embeddings(self.vocab_size)
    self.n_ctx = self.model.config.to_dict().get('n_ctx')
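Because the GPT-2 model here is paired with a BERT-style vocabulary, resize_token_embeddings is what keeps the embedding matrix in sync with the tokenizer. A minimal sketch of the same pairing using the public transformers classes; both file paths are assumptions:

from transformers import BertTokenizer, GPT2Config, GPT2LMHeadModel

tokenizer = BertTokenizer(vocab_file='config/vocab_en.txt')      # path assumed
config = GPT2Config.from_json_file('config/model_config.json')   # path assumed
model = GPT2LMHeadModel(config=config)
model.resize_token_embeddings(len(tokenizer))  # align embeddings with the vocab size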
Example #5
Source File: bert_processor.py From Bert-Multi-Label-Text-Classification with MIT License
def __init__(self, vocab_path, do_lower_case):
    self.tokenizer = BertTokenizer(vocab_path, do_lower_case)
Example #6
Source File: test_multi_turn_dialog.py From cotk with Apache License 2.0
def load_ubuntucorpus_bert():
    def _load_ubuntucorpus(min_rare_vocab_times=0):
        from transformers import BertTokenizer
        toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
        return UbuntuCorpus("./tests/dataloader/dummy_ubuntucorpus#Ubuntu",
                            min_rare_vocab_times=min_rare_vocab_times,
                            tokenizer=toker,
                            pretrained="bert")
    return _load_ubuntucorpus
Example #7
Source File: test_multi_turn_dialog.py From cotk with Apache License 2.0
def load_switchboardcorpus_bert():
    def _load_switchboardcorpus(min_rare_vocab_times=0):
        from transformers import BertTokenizer
        toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
        return SwitchboardCorpus("./tests/dataloader/dummy_switchboardcorpus#SwitchboardCorpus",
                                 min_rare_vocab_times=min_rare_vocab_times,
                                 tokenizer=toker,
                                 pretrained="bert")
    return _load_switchboardcorpus
Example #8
Source File: test_single_turn_dialog.py From cotk with Apache License 2.0
def load_opensubtitles_bert():
    def _load_opensubtitles(invalid_vocab_times=0):
        from transformers import BertTokenizer
        toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
        return OpenSubtitles("./tests/dataloader/dummy_opensubtitles#OpenSubtitles",
                             tokenizer=toker,
                             pretrained='bert',
                             min_rare_vocab_times=invalid_vocab_times)
    return _load_opensubtitles
Example #9
Source File: test_sentence_classification.py From cotk with Apache License 2.0
def load_sst_bert():
    def _load_sst(min_rare_vocab_times=0):
        from transformers import BertTokenizer
        toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
        return SST("./tests/dataloader/dummy_sst#SST",
                   tokenizer=toker,
                   min_rare_vocab_times=min_rare_vocab_times,
                   pretrained="bert")
    return _load_sst
Example #10
Source File: test_language_generation.py From cotk with Apache License 2.0
def load_mscoco_bert():
    def _load_mscoco(invalid_vocab_times=0):
        from transformers import BertTokenizer
        toker = PretrainedTokenizer(BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt'))
        return MSCOCO("./tests/dataloader/dummy_mscoco#MSCOCO",
                      tokenizer=toker,
                      pretrained='bert',
                      min_rare_vocab_times=invalid_vocab_times)
    return _load_mscoco
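Examples #6 through #10 all follow the same pattern: a BertTokenizer is built from a small dummy vocabulary file checked into the test suite, then wrapped for the dataloader under test. A minimal sketch of creating such a dummy WordPiece vocabulary and using it; the file contents here are an assumption modeled on standard BERT vocabularies, not the actual cotk fixture:

from transformers import BertTokenizer

# write a tiny one-token-per-line WordPiece vocabulary (contents hypothetical)
with open('dummy_vocab.txt', 'w') as f:
    for tok in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]',
                'hello', 'world', '##ing']:
        f.write(tok + '\n')

tok = BertTokenizer('dummy_vocab.txt')
print(tok.tokenize('hello worlding'))  # ['hello', 'world', '##ing']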
Example #11
Source File: tokenization.py From bert-japanese with Apache License 2.0
def __init__(self, vocab_file, do_lower_case=False,
             do_basic_tokenize=True, do_wordpiece_tokenize=True,
             mecab_dict_path=None, unk_token='[UNK]', sep_token='[SEP]',
             pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]',
             **kwargs):
    """Constructs a MecabBertTokenizer.

    Args:
        **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lower case the input.
            Only has an effect when do_basic_tokenize=True.
        **do_basic_tokenize**: (`optional`) boolean (default True)
            Whether to do basic tokenization with MeCab before wordpiece.
        **mecab_dict_path**: (`optional`) string
            Path to a directory of a MeCab dictionary.
    """
    super(BertTokenizer, self).__init__(
        unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
        cls_token=cls_token, mask_token=mask_token, **kwargs)
    self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
    self.max_len_sentences_pair = self.max_len - 3   # take into account special tokens

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'.".format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])
    self.do_basic_tokenize = do_basic_tokenize
    self.do_wordpiece_tokenize = do_wordpiece_tokenize
    if do_basic_tokenize:
        self.basic_tokenizer = MecabBasicTokenizer(do_lower_case=do_lower_case,
                                                   mecab_dict_path=mecab_dict_path)
    if do_wordpiece_tokenize:
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                      unk_token=self.unk_token)
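Once constructed, the subclass behaves like any BertTokenizer, with MeCab handling the basic segmentation step. A heavily hedged usage sketch: it assumes MeCab is installed and that vocab.txt is a matching Japanese WordPiece vocabulary, neither of which is shown on this page.

# both paths are assumptions; MeCab must be installed for this to run
tokenizer = MecabBertTokenizer(vocab_file='vocab.txt', mecab_dict_path=None)
tokens = tokenizer.tokenize('日本語のテキスト')   # MeCab segmentation, then WordPiece
ids = tokenizer.convert_tokens_to_ids(tokens)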
Example #12
Source File: tokenization.py From bert-japanese with Apache License 2.0
def __init__(self, vocab_file, do_lower_case=False,
             do_basic_tokenize=True, mecab_dict_path=None,
             unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]',
             cls_token='[CLS]', mask_token='[MASK]', **kwargs):
    """Constructs a MecabCharacterBertTokenizer.

    Args:
        **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lower case the input.
            Only has an effect when do_basic_tokenize=True.
        **do_basic_tokenize**: (`optional`) boolean (default True)
            Whether to do basic tokenization with MeCab before wordpiece.
        **mecab_dict_path**: (`optional`) string
            Path to a directory of a MeCab dictionary.
    """
    super(BertTokenizer, self).__init__(
        unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
        cls_token=cls_token, mask_token=mask_token, **kwargs)
    self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
    self.max_len_sentences_pair = self.max_len - 3   # take into account special tokens

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'.".format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])
    self.do_basic_tokenize = do_basic_tokenize
    if do_basic_tokenize:
        self.basic_tokenizer = MecabBasicTokenizer(do_lower_case=do_lower_case,
                                                   mecab_dict_path=mecab_dict_path,
                                                   preserve_spaces=True)
    self.wordpiece_tokenizer = CharacterTokenizer(vocab=self.vocab,
                                                  unk_token=self.unk_token,
                                                  with_markers=True)
Example #13
Source File: transformers_preprocessor.py From DeepPavlov with Apache License 2.0
def __init__(self, vocab_file: str, do_lower_case: bool = False,
             max_seq_length: int = 512, tokenize_chinese_chars: bool = True,
             **kwargs):
    vocab_file = expand_path(vocab_file)
    self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                   do_lower_case=do_lower_case,
                                   tokenize_chinese_chars=tokenize_chinese_chars)
    self.max_seq_length = max_seq_length
Example #14
Source File: dl_tester.py From textclf with MIT License
def _preprocess(self, text):
    text_tokenized = self.tokenizer(text)
    if isinstance(self.dictionary, Dictionary):
        text_processed = self.dictionary.tokens_to_tensor(
            text_tokenized, max_len=self.config.max_len
        )
        text_len = (text_processed != self.dictionary.pad()).sum()
    elif isinstance(self.dictionary, BertTokenizer):
        text_processed = torch.LongTensor(
            self.dictionary.encode(text_tokenized, add_special_tokens=True)[:-1])
        max_len = self.config.max_len
        pad_id = self.dictionary.pad_token_id
        if len(text_processed) >= max_len:
            text_processed = text_processed[:max_len]
        else:
            text_processed = torch.cat([
                text_processed,
                torch.ones(max_len - len(text_processed)).long() * pad_id
            ])
        text_len = (text_processed != pad_id).sum()
    if self.use_cuda:
        text_processed = text_processed.cuda()
        text_len = text_len.cuda()
    return text_processed.unsqueeze(0), text_len.unsqueeze(0)
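The BertTokenizer branch above hand-rolls pad-or-truncate to a fixed length. The same logic as a standalone helper, for clarity; this is a sketch with a hypothetical name, not textclf code:

import torch

def pad_or_truncate(ids: torch.LongTensor, max_len: int, pad_id: int) -> torch.LongTensor:
    # truncate long sequences, right-pad short ones with pad_id
    if len(ids) >= max_len:
        return ids[:max_len]
    padding = torch.full((max_len - len(ids),), pad_id, dtype=torch.long)
    return torch.cat([ids, padding])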
Example #15
Source File: squad_eval.py From inference with Apache License 2.0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file", default="build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt", help="Path to vocab.txt")
    parser.add_argument("--val_data", default="build/data/dev-v1.1.json", help="Path to validation data")
    parser.add_argument("--log_file", default="build/logs/mlperf_log_accuracy.json", help="Path to LoadGen accuracy log")
    parser.add_argument("--out_file", default="build/result/predictions.json", help="Path to output predictions file")
    parser.add_argument("--features_cache_file", default="eval_features.pickle", help="Path to features' cache file")
    parser.add_argument("--output_transposed", action="store_true", help="Transpose the output")
    args = parser.parse_args()

    print("Reading examples...")
    eval_examples = read_squad_examples(input_file=args.val_data,
                                        is_training=False,
                                        version_2_with_negative=False)

    eval_features = []
    # Load features if cached, convert from examples otherwise.
    cache_path = args.features_cache_file
    if os.path.exists(cache_path):
        print("Loading cached features from '%s'..." % cache_path)
        with open(cache_path, 'rb') as cache_file:
            eval_features = pickle.load(cache_file)
    else:
        print("No cached features at '%s'... converting from examples..." % cache_path)

        print("Creating tokenizer...")
        tokenizer = BertTokenizer(args.vocab_file)

        print("Converting examples to features...")
        def append_feature(feature):
            eval_features.append(feature)
        convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            output_fn=append_feature,
            verbose_logging=False)

        print("Caching features at '%s'..." % cache_path)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(eval_features, cache_file)

    print("Loading LoadGen logs...")
    results = load_loadgen_log(args.log_file, eval_features, args.output_transposed)

    print("Post-processing predictions...")
    write_predictions(eval_examples, eval_features, results, 20, 30, True, args.out_file)

    print("Evaluating predictions...")
    cmd = "python3 build/data/evaluate-v1.1.py build/data/dev-v1.1.json build/result/predictions.json"
    subprocess.check_call(cmd, shell=True)
Example #16
Source File: utils.py From MultiTurnDialogZoo with MIT License
def transformer_preprocess(src_path, tgt_path, tokenized_file,
                           vocab_file='./config/vocab_en.txt', ctx=200):
    '''
    tokenize the dataset for NLG (GPT2), write the tokenized ids into tokenized_file.
    more details can be found in https://github.com/yangjianxin1/GPT2-chitchat
    '''
    def clean_inside(s):
        s = s.replace('<user0>', '')
        s = s.replace('<user1>', '')
        s = s.strip()
        s = clean(s)
        return s

    # create the Bert tokenizer of the GPT2 model
    tokenizer = BertTokenizer(vocab_file=vocab_file)
    src_data, tgt_data = read_file(src_path), read_file(tgt_path)
    src_data = [' '.join(i) for i in src_data]
    tgt_data = [' '.join(i) for i in tgt_data]
    assert len(src_data) == len(tgt_data), f'[!] length of src and tgt: {len(src_data)}/{len(tgt_data)}'

    # combine them
    corpus = []
    longest = 0
    for s, t in tqdm(list(zip(src_data, tgt_data))):
        item = [tokenizer.cls_token_id]   # [CLS] for each dialogue at the beginning
        s = s + ' __eou__ ' + t
        s = clean_inside(s)
        utterances = s.split('__eou__')
        for utterance in utterances:
            words = nltk.word_tokenize(utterance)
            item.extend([tokenizer.convert_tokens_to_ids(word) for word in words])
            item.append(tokenizer.sep_token_id)
        if len(item) > longest:
            longest = len(item)
        item = item[:ctx]
        corpus.append(item)

    # write into the file
    with open(tokenized_file, 'w') as f:
        for i in range(len(corpus)):
            words = [str(word) for word in corpus[i]]
            f.write(f'{" ".join(words)}')
            if i < len(corpus) - 1:
                f.write('\n')
    print(f'[!] Preprocess the data for the transformers (GPT2), the longest sentence: {longest}, write the data into {tokenized_file}.')

# From https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Optim.py
# ========== lr scheduler for transformer ==========
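Since each line of the tokenized_file written above is a space-separated list of token ids, reading it back is symmetric. A minimal sketch of the inverse step; the helper name is my own, not part of MultiTurnDialogZoo:

def read_tokenized_file(path):
    # parse each space-separated line back into a list of token ids
    with open(path) as f:
        return [[int(tok) for tok in line.split()] for line in f]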