Python transformers.BertTokenizer() Examples
The following are 16 code examples of transformers.BertTokenizer().
Example #1
Source File: From textclf with MIT License | 6 votes |
def build_loader(pairs, dictionary_or_tokenizer, label2id, config):
    """Build a ``DataLoader`` over labelled text pairs.

    Args:
        pairs: Iterable of ``(text, label)`` tuples.
        dictionary_or_tokenizer: Either a ``Dictionary`` or a
            ``BertTokenizer``; its type selects the collate function used
            to batch the samples.
        label2id: Mapping used to translate each label into its id.
        config: Object providing ``max_len``, ``batch_size``, ``shuffle``,
            ``num_workers``, ``pin_memory`` and ``drop_last``.

    Returns:
        The ``DataLoader`` wrapping a ``TextClfDataset`` of the encoded pairs.

    Raises:
        TypeError: If ``dictionary_or_tokenizer`` is neither a ``Dictionary``
            nor a ``BertTokenizer``.
    """
    pairs = [(text, label2id[label]) for text, label in pairs]

    if isinstance(dictionary_or_tokenizer, Dictionary):
        col_fn = partial(collate_fn, dictionary_or_tokenizer, config.max_len)
    elif isinstance(dictionary_or_tokenizer, BertTokenizer):
        col_fn = partial(bert_collate_fn, dictionary_or_tokenizer, config.max_len)
    else:
        # Without this branch ``col_fn`` stays unbound and the DataLoader
        # construction below fails with an opaque UnboundLocalError.
        raise TypeError(
            "dictionary_or_tokenizer must be a Dictionary or a BertTokenizer, "
            "got {}".format(type(dictionary_or_tokenizer).__name__)
        )

    loader = DataLoader(
        dataset=TextClfDataset(pairs),
        collate_fn=col_fn,
        batch_size=config.batch_size,
        shuffle=config.shuffle,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory,
        drop_last=config.drop_last,
    )
    return loader
Example #2
Source File: From textclf with MIT License | 6 votes |
def bert_collate_fn(
    tokenizer: BertTokenizer,
    max_len: int,
    pairs: Iterable[Tuple[str, int]]
):
    """Collate ``(text, label)`` pairs into padded id tensors.

    Each text is split on whitespace and cut to at most ``max_len`` words
    before being passed to ``tokenizer.encode``.

    Args:
        tokenizer: Provides ``encode`` and ``pad_token_id``.
        max_len: Maximum number of whitespace-separated words kept per text.
        pairs: Iterable of ``(text, label)`` tuples.

    Returns:
        A tuple ``(ids, text_lens, labels)``: the padded id matrix, the
        per-row length (word count + 1) and the labels, all ``LongTensor``.
    """
    truncated = [(text.split()[:max_len], label) for text, label in pairs]
    token_lists, targets = zip(*truncated)
    labels = torch.LongTensor(targets)

    # +1 for [CLS] token
    text_lens = torch.LongTensor([len(tokens) + 1 for tokens in token_lists])
    width = text_lens.max().item()

    # Start from an all-padding matrix and overwrite the used prefix per row.
    ids = torch.full(
        (len(token_lists), width), tokenizer.pad_token_id, dtype=torch.long
    )
    for row, tokens in enumerate(token_lists):
        # The last encoded id is dropped (presumably the trailing [SEP] --
        # confirm against the tokenizer in use).
        encoded = tokenizer.encode(tokens, add_special_tokens=True)[:-1]
        ids[row, :len(tokens) + 1] = torch.LongTensor(encoded)
    return ids, text_lens, labels
Example #3
Source File: From inference with Apache License 2.0 | 5 votes |
def __init__(self, perf_count=None, cache_path='eval_features.pickle'):
    """Build the query sample library from cached or freshly converted features.

    Features are unpickled from ``cache_path`` when that file exists.
    Otherwise they are converted from the SQuAD examples and then pickled
    to ``cache_path``.

    Args:
        perf_count: Performance sample count handed to ``lg.ConstructQSL``;
            ``None`` means "use the total number of features".
        cache_path: Location of the pickled feature cache.

    NOTE(review): ``max_seq_length``, ``doc_stride`` and ``max_query_length``
    are not defined in this method; they are read from the enclosing scope.
    """
    print("Constructing QSL...")
    eval_features = []

    # Load features if cached, convert from examples otherwise.
    if not os.path.exists(cache_path):
        print("No cached features at '%s'... converting from examples..." % cache_path)

        print("Creating tokenizer...")
        tokenizer = BertTokenizer("build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt")

        print("Reading examples...")
        eval_examples = read_squad_examples(
            input_file="build/data/dev-v1.1.json",
            is_training=False,
            version_2_with_negative=False)

        print("Converting examples to features...")
        # Converted features are appended straight onto the local list.
        convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            output_fn=eval_features.append,
            verbose_logging=False)

        print("Caching features at '%s'..." % cache_path)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(eval_features, cache_file)
    else:
        print("Loading cached features from '%s'..." % cache_path)
        with open(cache_path, 'rb') as cache_file:
            eval_features = pickle.load(cache_file)

    self.eval_features = eval_features
    self.count = len(self.eval_features)
    if perf_count is None:
        self.perf_count = self.count
    else:
        self.perf_count = perf_count
    self.qsl = lg.ConstructQSL(
        self.count,
        self.perf_count,
        self.load_query_samples,
        self.unload_query_samples)
    print("Finished constructing QSL.")
Example #4
Source File: From MultiTurnDialogZoo with MIT License | 5 votes |
def __init__(self, config_path):
    """Create the tokenizer and a GPT-2 LM-head model from a JSON config.

    Args:
        config_path: Path to the JSON file read by
            ``GPT2Config.from_json_file``.
    """
    super(transformer_gpt2, self).__init__()

    # NOTE(review): the attribute name ``tokenzier`` looks like a typo for
    # ``tokenizer``; it is kept because code outside this method may read it.
    vocab_tokenizer = BertTokenizer(vocab_file='config/vocab_en.txt')
    self.tokenzier = vocab_tokenizer
    self.vocab_size = len(vocab_tokenizer)

    gpt2_config = transformers.modeling_gpt2.GPT2Config.from_json_file(config_path)
    self.model_config = gpt2_config
    self.model = GPT2LMHeadModel(config=gpt2_config)
    # Resize the embedding table to the tokenizer's vocabulary size.
    self.model.resize_token_embeddings(self.vocab_size)

    config_dict = self.model.config.to_dict()
    self.n_ctx = config_dict.get('n_ctx')
Example #5
Source File: From Bert-Multi-Label-Text-Classification with MIT License | 5 votes |
def __init__(self, vocab_path, do_lower_case):
    """Create the wrapped ``BertTokenizer``.

    Args:
        vocab_path: First positional argument given to ``BertTokenizer``.
        do_lower_case: Second positional argument given to ``BertTokenizer``.
    """
    bert_tokenizer = BertTokenizer(vocab_path, do_lower_case)
    self.tokenizer = bert_tokenizer
Example #6
Source File: From cotk with Apache License 2.0 | 5 votes |
def load_ubuntucorpus_bert():
    """Return a factory that builds an ``UbuntuCorpus`` using a BERT tokenizer.

    The returned callable takes ``min_rare_vocab_times`` (default 0) and
    loads the dummy Ubuntu corpus and dummy BERT vocabulary under
    ``./tests/dataloader``.
    """
    def _load_ubuntucorpus(min_rare_vocab_times=0):
        # Imported lazily so transformers is only needed when the factory runs.
        from transformers import BertTokenizer

        bert_tokenizer = BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt')
        toker = PretrainedTokenizer(bert_tokenizer)
        return UbuntuCorpus(
            "./tests/dataloader/dummy_ubuntucorpus#Ubuntu",
            min_rare_vocab_times=min_rare_vocab_times,
            tokenizer=toker,
            pretrained="bert",
        )

    return _load_ubuntucorpus
Example #7
Source File: From cotk with Apache License 2.0 | 5 votes |
def load_switchboardcorpus_bert():
    """Return a factory that builds a ``SwitchboardCorpus`` using a BERT tokenizer.

    The returned callable takes ``min_rare_vocab_times`` (default 0) and
    loads the dummy Switchboard corpus and dummy BERT vocabulary under
    ``./tests/dataloader``.
    """
    def _load_switchboardcorpus(min_rare_vocab_times=0):
        # Imported lazily so transformers is only needed when the factory runs.
        from transformers import BertTokenizer

        bert_tokenizer = BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt')
        toker = PretrainedTokenizer(bert_tokenizer)
        return SwitchboardCorpus(
            "./tests/dataloader/dummy_switchboardcorpus#SwitchboardCorpus",
            min_rare_vocab_times=min_rare_vocab_times,
            tokenizer=toker,
            pretrained="bert",
        )

    return _load_switchboardcorpus
Example #8
Source File: From cotk with Apache License 2.0 | 5 votes |
def load_opensubtitles_bert():
    """Return a factory that builds an ``OpenSubtitles`` set using a BERT tokenizer.

    The returned callable takes ``invalid_vocab_times`` (default 0), which
    is forwarded as ``min_rare_vocab_times``, and loads the dummy data and
    dummy BERT vocabulary under ``./tests/dataloader``.
    """
    def _load_opensubtitles(invalid_vocab_times=0):
        # Imported lazily so transformers is only needed when the factory runs.
        from transformers import BertTokenizer

        bert_tokenizer = BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt')
        toker = PretrainedTokenizer(bert_tokenizer)
        return OpenSubtitles(
            "./tests/dataloader/dummy_opensubtitles#OpenSubtitles",
            tokenizer=toker,
            pretrained='bert',
            min_rare_vocab_times=invalid_vocab_times,
        )

    return _load_opensubtitles
Example #9
Source File: From cotk with Apache License 2.0 | 5 votes |
def load_sst_bert():
    """Return a factory that builds an ``SST`` set using a BERT tokenizer.

    The returned callable takes ``min_rare_vocab_times`` (default 0) and
    loads the dummy SST data and dummy BERT vocabulary under
    ``./tests/dataloader``.
    """
    def _load_sst(min_rare_vocab_times=0):
        # Imported lazily so transformers is only needed when the factory runs.
        from transformers import BertTokenizer

        bert_tokenizer = BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt')
        toker = PretrainedTokenizer(bert_tokenizer)
        return SST(
            "./tests/dataloader/dummy_sst#SST",
            tokenizer=toker,
            min_rare_vocab_times=min_rare_vocab_times,
            pretrained="bert",
        )

    return _load_sst
Example #10
Source File: From cotk with Apache License 2.0 | 5 votes |
def load_mscoco_bert():
    """Return a factory that builds an ``MSCOCO`` set using a BERT tokenizer.

    The returned callable takes ``invalid_vocab_times`` (default 0), which
    is forwarded as ``min_rare_vocab_times``, and loads the dummy data and
    dummy BERT vocabulary under ``./tests/dataloader``.
    """
    def _load_mscoco(invalid_vocab_times=0):
        # Imported lazily so transformers is only needed when the factory runs.
        from transformers import BertTokenizer

        bert_tokenizer = BertTokenizer('./tests/dataloader/dummy_bertvocab/vocab.txt')
        toker = PretrainedTokenizer(bert_tokenizer)
        return MSCOCO(
            "./tests/dataloader/dummy_mscoco#MSCOCO",
            tokenizer=toker,
            pretrained='bert',
            min_rare_vocab_times=invalid_vocab_times,
        )

    return _load_mscoco
Example #11
Source File: From bert-japanese with Apache License 2.0 | 5 votes |
def __init__(self, vocab_file, do_lower_case=False,
             do_basic_tokenize=True, do_wordpiece_tokenize=True,
             mecab_dict_path=None, unk_token='[UNK]', sep_token='[SEP]',
             pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
    """Constructs a MecabBertTokenizer.

    Args:
        **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lower case the input.
            Only has an effect when do_basic_tokenize=True.
        **do_basic_tokenize**: (`optional`) boolean (default True)
            Whether to create the MeCab basic tokenizer.
        **do_wordpiece_tokenize**: (`optional`) boolean (default True)
            Whether to create the wordpiece tokenizer over the vocabulary.
        **mecab_dict_path**: (`optional`) string
            Path to a directory of a MeCab dictionary.
        **unk_token**, **sep_token**, **pad_token**, **cls_token**,
        **mask_token**, ****kwargs**: forwarded to the parent initializer.

    Raises:
        ValueError: If ``vocab_file`` is not an existing file.
    """
    # super(BertTokenizer, self) starts the lookup *after* BertTokenizer in
    # the MRO, so BertTokenizer.__init__ itself is skipped (presumably on
    # purpose, since this method redoes the vocabulary setup itself).
    super(BertTokenizer, self).__init__(
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        **kwargs)
    self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
    self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

    if not os.path.isfile(vocab_file):
        message = "Can't find a vocabulary file at path '{}'.".format(vocab_file)
        raise ValueError(message)

    self.vocab = load_vocab(vocab_file)
    # Reverse mapping (id -> token), in the vocabulary's iteration order.
    self.ids_to_tokens = collections.OrderedDict(
        (index, token) for token, index in self.vocab.items())

    self.do_basic_tokenize = do_basic_tokenize
    self.do_wordpiece_tokenize = do_wordpiece_tokenize

    if do_basic_tokenize:
        self.basic_tokenizer = MecabBasicTokenizer(
            do_lower_case=do_lower_case,
            mecab_dict_path=mecab_dict_path)

    if do_wordpiece_tokenize:
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab,
            unk_token=self.unk_token)
Example #12
Source File: From bert-japanese with Apache License 2.0 | 5 votes |
def __init__(self, vocab_file, do_lower_case=False, do_basic_tokenize=True,
             mecab_dict_path=None, unk_token='[UNK]', sep_token='[SEP]',
             pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
    """Constructs a MecabCharacterBertTokenizer.

    Args:
        **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lower case the input.
            Only has an effect when do_basic_tokenize=True.
        **do_basic_tokenize**: (`optional`) boolean (default True)
            Whether to create the MeCab basic tokenizer.
        **mecab_dict_path**: (`optional`) string
            Path to a directory of a MeCab dictionary.
        **unk_token**, **sep_token**, **pad_token**, **cls_token**,
        **mask_token**, ****kwargs**: forwarded to the parent initializer.

    Raises:
        ValueError: If ``vocab_file`` is not an existing file.
    """
    # super(BertTokenizer, self) starts the lookup *after* BertTokenizer in
    # the MRO, so BertTokenizer.__init__ itself is skipped (presumably on
    # purpose, since this method redoes the vocabulary setup itself).
    super(BertTokenizer, self).__init__(
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        **kwargs)
    self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
    self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

    if not os.path.isfile(vocab_file):
        message = "Can't find a vocabulary file at path '{}'.".format(vocab_file)
        raise ValueError(message)

    self.vocab = load_vocab(vocab_file)
    # Reverse mapping (id -> token), in the vocabulary's iteration order.
    self.ids_to_tokens = collections.OrderedDict(
        (index, token) for token, index in self.vocab.items())

    self.do_basic_tokenize = do_basic_tokenize
    if do_basic_tokenize:
        self.basic_tokenizer = MecabBasicTokenizer(
            do_lower_case=do_lower_case,
            mecab_dict_path=mecab_dict_path,
            preserve_spaces=True)

    # Stored under the ``wordpiece_tokenizer`` attribute name although it is
    # a CharacterTokenizer; it is always created.
    self.wordpiece_tokenizer = CharacterTokenizer(
        vocab=self.vocab,
        unk_token=self.unk_token,
        with_markers=True)
Example #13
Source File: From DeepPavlov with Apache License 2.0 | 5 votes |
def __init__(self,
             vocab_file: str,
             do_lower_case: bool = False,
             max_seq_length: int = 512,
             tokenize_chinese_chars: bool = True,
             **kwargs):
    """Create the underlying ``BertTokenizer`` and store the length limit.

    Args:
        vocab_file: Vocabulary path; passed through ``expand_path`` first.
        do_lower_case: Forwarded to ``BertTokenizer``.
        max_seq_length: Stored on the instance as ``max_seq_length``.
        tokenize_chinese_chars: Forwarded to ``BertTokenizer``.
        **kwargs: Accepted but not used by this method.
    """
    resolved_vocab = expand_path(vocab_file)
    self.tokenizer = BertTokenizer(
        vocab_file=resolved_vocab,
        do_lower_case=do_lower_case,
        tokenize_chinese_chars=tokenize_chinese_chars,
    )
    self.max_seq_length = max_seq_length
Example #14
Source File: From textclf with MIT License | 5 votes |
def _preprocess(self, text):
    """Tokenize ``text`` and encode it as a batch-of-one id tensor.

    Args:
        text: Raw input passed to ``self.tokenizer``.

    Returns:
        A tuple ``(text_processed, text_len)``. Both have a leading
        dimension added via ``unsqueeze(0)`` and are moved to CUDA when
        ``self.use_cuda`` is set. ``text_len`` is the number of ids that
        differ from the padding id.

    Raises:
        TypeError: If ``self.dictionary`` is neither a ``Dictionary`` nor a
            ``BertTokenizer``.
    """
    text_tokenized = self.tokenizer(text)

    if isinstance(self.dictionary, Dictionary):
        text_processed = self.dictionary.tokens_to_tensor(
            text_tokenized, max_len=self.config.max_len
        )
        text_len = (text_processed != self.dictionary.pad()).sum()
    elif isinstance(self.dictionary, BertTokenizer):
        # The last encoded id is dropped (presumably the trailing [SEP] --
        # confirm against the tokenizer in use).
        text_processed = torch.LongTensor(
            self.dictionary.encode(text_tokenized, add_special_tokens=True)[:-1])
        max_len = self.config.max_len
        pad_id = self.dictionary.pad_token_id
        if len(text_processed) >= max_len:
            text_processed = text_processed[:max_len]
        else:
            # Right-pad to exactly max_len. The original line had an
            # unbalanced bracket here; concatenating the two tensors is the
            # only reading that keeps text_processed a tensor for the
            # comparison and unsqueeze calls below.
            text_processed = torch.cat([
                text_processed,
                torch.ones(max_len - len(text_processed)).long() * pad_id
            ])
        text_len = (text_processed != pad_id).sum()
    else:
        # Fail explicitly instead of hitting an unbound local further down.
        raise TypeError(
            "self.dictionary must be a Dictionary or a BertTokenizer, "
            "got {}".format(type(self.dictionary).__name__)
        )

    if self.use_cuda:
        text_processed = text_processed.cuda()
        text_len = text_len.cuda()

    return text_processed.unsqueeze(0), text_len.unsqueeze(0)
Example #15
Source File: From inference with Apache License 2.0 | 4 votes |
def main():
    """Turn a LoadGen accuracy log into SQuAD predictions and evaluate them.

    Steps: parse the command line, read the validation examples, load the
    features (from the pickle cache when present, otherwise convert and
    cache them), load the LoadGen log, write the predictions file and run
    the evaluation script on it.

    Raises:
        subprocess.CalledProcessError: If the evaluation command exits
            with a non-zero status.

    NOTE(review): ``max_seq_length``, ``doc_stride`` and ``max_query_length``
    are not defined in this function; they are read from the enclosing scope.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file", default="build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt", help="Path to vocab.txt")
    parser.add_argument("--val_data", default="build/data/dev-v1.1.json", help="Path to validation data")
    parser.add_argument("--log_file", default="build/logs/mlperf_log_accuracy.json", help="Path to LoadGen accuracy log")
    parser.add_argument("--out_file", default="build/result/predictions.json", help="Path to output predictions file")
    parser.add_argument("--features_cache_file", default="eval_features.pickle", help="Path to features' cache file")
    parser.add_argument("--output_transposed", action="store_true", help="Transpose the output")
    args = parser.parse_args()

    print("Reading examples...")
    eval_examples = read_squad_examples(input_file=args.val_data,
                                        is_training=False, version_2_with_negative=False)

    eval_features = []
    # Load features if cached, convert from examples otherwise.
    cache_path = args.features_cache_file
    if os.path.exists(cache_path):
        print("Loading cached features from '%s'..." % cache_path)
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point --features_cache_file at caches you created yourself.
        with open(cache_path, 'rb') as cache_file:
            eval_features = pickle.load(cache_file)
    else:
        print("No cached features at '%s'... converting from examples..." % cache_path)

        print("Creating tokenizer...")
        tokenizer = BertTokenizer(args.vocab_file)

        print("Converting examples to features...")

        def append_feature(feature):
            eval_features.append(feature)

        convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            output_fn=append_feature,
            verbose_logging=False)

        print("Caching features at '%s'..." % cache_path)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(eval_features, cache_file)

    print("Loading LoadGen logs...")
    results = load_loadgen_log(args.log_file, eval_features, args.output_transposed)

    print("Post-processing predictions...")
    # NOTE(review): 20, 30 and True are positional arguments whose meaning is
    # defined by write_predictions, which is not visible here.
    write_predictions(eval_examples, eval_features, results, 20, 30, True, args.out_file)

    print("Evaluating predictions...")
    # Evaluate the files that were actually used above instead of the
    # hard-coded defaults, and pass an argument list so no shell is involved.
    # NOTE(review): the script path "build/data/" looks truncated (it names a
    # directory, not a script); kept verbatim -- confirm the intended file.
    cmd = ["python3", "build/data/", args.val_data, args.out_file]
    subprocess.check_call(cmd)
Example #16
Source File: From MultiTurnDialogZoo with MIT License | 4 votes |
def transformer_preprocess(src_path, tgt_path, tokenized_file,
                           vocab_file='./config/vocab_en.txt', ctx=200):
    '''
    Tokenize a source/target dialogue dataset for NLG (GPT2) and write the
    token ids to ``tokenized_file``, one dialogue per line.

    Each dialogue is encoded as the [CLS] id, then the word ids of every
    ``__eou__``-separated utterance, each utterance followed by the [SEP]
    id. The id sequence is cut to at most ``ctx`` entries.

    :param src_path: file read with ``read_file`` for the source side
    :param tgt_path: file read with ``read_file`` for the target side
    :param tokenized_file: output path for the space-separated ids
    :param vocab_file: vocabulary file handed to ``BertTokenizer``
    :param ctx: maximum number of ids kept per dialogue
    '''
    def clean_inside(s):
        # Drop the speaker tags, trim, then apply the shared ``clean`` helper.
        for speaker_tag in ('<user0>', '<user1>'):
            s = s.replace(speaker_tag, '')
        return clean(s.strip())

    # create the Bert tokenizer of the GPT2 model
    tokenizer = BertTokenizer(vocab_file=vocab_file)

    raw_src = read_file(src_path)
    raw_tgt = read_file(tgt_path)
    src_data = [' '.join(turns) for turns in raw_src]
    tgt_data = [' '.join(turns) for turns in raw_tgt]
    assert len(src_data) == len(tgt_data), f'[!] length of src and tgt: {len(src_data)}/{len(tgt_data)}'

    # combine them
    corpus = []
    longest = 0
    for source, target in tqdm(list(zip(src_data, tgt_data))):
        item = [tokenizer.cls_token_id]   # [CLS] for each dialogue in the beginning
        dialogue = clean_inside(source + ' __eou__ ' + target)
        for utterance in dialogue.split('__eou__'):
            item.extend(
                tokenizer.convert_tokens_to_ids(word)
                for word in nltk.word_tokenize(utterance)
            )
            item.append(tokenizer.sep_token_id)
        # ``longest`` tracks the length before truncation to ``ctx``.
        longest = max(longest, len(item))
        corpus.append(item[:ctx])

    # write into the file: newline between dialogues, none after the last
    with open(tokenized_file, 'w') as f:
        f.write('\n'.join(
            ' '.join(str(token_id) for token_id in item) for item in corpus
        ))

    print(f'[!] Preprocess the data for the transformers(GPT2), the longest sentence :{longest}, write the data into {tokenized_file}.')


# From
# ========== lr scheduler for transformer ==========