Python sentencepiece.SentencePieceProcessor() Examples
The following are 30 code examples of sentencepiece.SentencePieceProcessor(), collected from open-source projects. You can go to the original project or source file by following the link above each example.
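Before the project-specific examples, here is a minimal usage sketch of the SentencePieceProcessor API that most of them rely on. The path "spm.model" is a placeholder; substitute a model file you have already trained with SentencePieceTrainer.

import sentencepiece as spm

# Load a trained SentencePiece model ("spm.model" is a placeholder path).
sp = spm.SentencePieceProcessor()
sp.Load("spm.model")

# Encode text into subword pieces and ids, then decode the ids back to text.
pieces = sp.EncodeAsPieces("This is a test.")
ids = sp.EncodeAsIds("This is a test.")
print(pieces)             # subword pieces, e.g. ['▁This', '▁is', '▁a', '▁test', '.']
print(ids)                # corresponding integer ids
print(sp.DecodeIds(ids))  # reconstructs the original text

The examples below follow the same pattern: construct a SentencePieceProcessor, load a .model file, then encode or decode with it.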
Example #1
Source File: vocab.py From BERT-keras with GNU General Public License v3.0 | 6 votes |
def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
             vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
    super().__init__(vocab_size)
    if not os.path.exists('{}.model'.format(model_name)):
        if spm_model_type.lower() not in ('unigram', 'bpe', 'char', 'word'):
            raise ValueError(
                '{} is not a valid model_type for sentence piece, '
                'valid options are: unigram, bpe, char, word'.format(spm_model_type))
        spm.SentencePieceTrainer.Train(
            '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
            '--character_coverage={coverage} --model_type={model_type} '
            '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '.format(
                input=text_corpus_address, model_name=model_name, vocab_size=vocab_size,
                coverage=1, model_type=spm_model_type.lower()))
    self.sp = spm.SentencePieceProcessor()
    self.sp.load('{}.model'.format(model_name))
Example #2
Source File: sp_encoder.py From ru_transformers with Apache License 2.0 | 6 votes |
def __init__(self, filename, *inputs, **kwargs):
    super().__init__(*inputs, **kwargs)
    self.max_len_single_sentence = 1024  # no default special tokens - you can update this value if you add special tokens
    self.max_len_sentences_pair = 1024   # no default special tokens - you can update this value if you add special tokens

    if os.path.isdir(filename):
        filename = os.path.join(filename, self.def_name)
    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(filename)
    self.hash = hashlib.sha512(open(filename, 'rb').read()).hexdigest()[:10]
    self.filename = filename
    # for some reason SentencePiece inserts a blank line id before special token if that is the only
    # token in the line. I'd like to remove that blank line id from encoding.
    nl_ids = self.sp.EncodeAsIds(NEW_LINE)
    assert(len(nl_ids) == 2)
    self.blank_line_id = nl_ids[0]
Example #3
Source File: tokenizers.py From virtex with MIT License | 6 votes |
def __init__(self, vocab_path: str, model_path: str):
    self.vocab_path = vocab_path
    self.model_path = model_path

    # Load pretrained tokenizer model.
    self.model = sp.SentencePieceProcessor()
    self.model.Load(model_path)

    # Load vocabulary mapping (and inverse mapping) between token and id.
    self._token_to_id: Dict[str, int] = {}
    self._id_to_token: Dict[int, str] = {}

    with open(vocab_path, "r") as vocab_file:
        reader = csv.DictReader(
            vocab_file, delimiter="\t", fieldnames=["token", "logprob"]
        )
        for index, row in enumerate(reader):
            self._token_to_id[row["token"]] = index
            self._id_to_token[index] = row["token"]
Example #4
Source File: tokenization_xlnet.py From TextClassify with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, max_len=None,
             do_lower_case=False, remove_space=True, keep_accents=False,
             bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
             pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
             additional_special_tokens=["<eop>", "<eod>"], **kwargs):
    super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                         unk_token=unk_token, sep_token=sep_token,
                                         pad_token=pad_token, cls_token=cls_token,
                                         mask_token=mask_token,
                                         additional_special_tokens=additional_special_tokens,
                                         **kwargs)
    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                       "pip install sentencepiece")

    self.do_lower_case = do_lower_case
    self.remove_space = remove_space
    self.keep_accents = keep_accents
    self.vocab_file = vocab_file

    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(vocab_file)
Example #5
Source File: gdqn.py From KG-A2C with MIT License | 6 votes |
def __init__(self, params):
    configure_logger(params['output_dir'])
    log('Parameters {}'.format(params))
    self.params = params
    self.binding = load_bindings(params['rom_file_path'])
    self.max_word_length = self.binding['max_word_length']
    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(params['spm_file'])
    kg_env = KGA2CEnv(params['rom_file_path'], params['seed'], self.sp,
                      params['tsv_file'], step_limit=params['reset_steps'],
                      stuck_steps=params['stuck_steps'], gat=params['gat'])
    self.vec_env = VecEnv(params['batch_size'], kg_env, params['openie_path'])
    self.template_generator = TemplateActionGenerator(self.binding)
    env = FrotzEnv(params['rom_file_path'])
    self.vocab_act, self.vocab_act_rev = load_vocab(env)
    self.model = KGA2C(params, self.template_generator.templates, self.max_word_length,
                       self.vocab_act, self.vocab_act_rev, len(self.sp),
                       gat=self.params['gat']).cuda()
    self.batch_size = params['batch_size']
    if params['preload_weights']:
        self.model = torch.load(self.params['preload_weights'])['model']
    self.optimizer = optim.Adam(self.model.parameters(), lr=params['lr'])

    self.loss_fn1 = nn.BCELoss()
    self.loss_fn2 = nn.BCEWithLogitsLoss()
    self.loss_fn3 = nn.MSELoss()
Example #6
Source File: tokenization.py From albert with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    self.vocab = None
    self.sp_model = None
    if spm_model_file:
        self.sp_model = spm.SentencePieceProcessor()
        tf.logging.info("loading sentence piece model")
        # Handle cases where SP can't load the file, but gfile can.
        sp_model_ = tf.gfile.GFile(spm_model_file, "rb").read()
        self.sp_model.LoadFromSerializedProto(sp_model_)
        # Note(mingdachen): For the purpose of consisent API, we are
        # generating a vocabulary for the sentence piece tokenizer.
        self.vocab = {self.sp_model.IdToPiece(i): i
                      for i in range(self.sp_model.GetPieceSize())}
    else:
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
Example #7
Source File: vocab.py From BERT with Apache License 2.0 | 6 votes |
def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
             vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
    super().__init__(vocab_size)
    if not os.path.exists('{}.model'.format(model_name)):
        if spm_model_type.lower() not in ('unigram', 'bpe', 'char', 'word'):
            raise ValueError(
                '{} is not a valid model_type for sentence piece, '
                'valid options are: unigram, bpe, char, word'.format(spm_model_type))
        spm.SentencePieceTrainer.Train(
            '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
            '--character_coverage={coverage} --model_type={model_type} '
            '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '
            '--training_sentence_size=100000000'.format(
                input=text_corpus_address, model_name=model_name, vocab_size=vocab_size,
                coverage=1, model_type=spm_model_type.lower()))
    self.sp = spm.SentencePieceProcessor()
    self.sp.load('{}.model'.format(model_name))
Example #8
Source File: tokenizers.py From bert4keras with Apache License 2.0 | 6 votes |
def __init__(self, sp_model_path, *args, **kwargs):
    super(SpTokenizer, self).__init__(*args, **kwargs)
    import sentencepiece as spm
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(sp_model_path)
    self._token_pad = self.sp_model.id_to_piece(self.sp_model.pad_id())
    self._token_unk = self.sp_model.id_to_piece(self.sp_model.unk_id())
    self._vocab_size = self.sp_model.get_piece_size()

    for token in ['pad', 'unk', 'mask', 'start', 'end']:
        try:
            _token = getattr(self, '_token_%s' % token)
            _token_id = self.sp_model.piece_to_id(_token)
            setattr(self, '_token_%s_id' % token, _token_id)
        except:
            pass
Example #9
Source File: test_sentencepiece_tokenizer.py From espnet with Apache License 2.0 | 6 votes |
def spm_srcs(tmp_path: Path):
    input_text = tmp_path / "text"
    vocabsize = len(string.ascii_letters) + 4
    model_prefix = tmp_path / "model"
    model = str(model_prefix) + ".model"
    input_sentence_size = 100000

    with input_text.open("w") as f:
        f.write(string.ascii_letters + "\n")

    spm.SentencePieceTrainer.Train(
        f"--input={input_text} "
        f"--vocab_size={vocabsize} "
        f"--model_prefix={model_prefix} "
        f"--input_sentence_size={input_sentence_size}"
    )
    sp = spm.SentencePieceProcessor()
    sp.load(model)

    with input_text.open("r") as f:
        vocabs = {"<unk>", "▁"}
        for line in f:
            tokens = sp.DecodePieces(list(line.strip()))
            vocabs |= set(tokens)
    return model, vocabs
Example #10
Source File: prepare_data.py From Decoders-Chinese-TF2.0 with MIT License | 6 votes |
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--spm_model_path', default='spm_model/ch.model', type=str, required=False,
                        help='path to the sentencepiece model')
    parser.add_argument('--raw_data_path', default='data/train_test.txt', type=str, required=False,
                        help='path to the raw corpus')
    parser.add_argument('--save_tfrecord_path', default='data/tokenized/', type=str, required=False,
                        help='where to store the processed corpus')
    parser.add_argument('--min_length', default=10, type=int, required=False,
                        help='minimum sentence length to include')
    parser.add_argument('--n_ctx', default=512, type=int, required=False,
                        help='length of each training sample')
    parser.add_argument('--batch_size', default=8, type=int, required=False,
                        help='only used for the XL model (its batch size); set to 1 for GPT2')
    parser.add_argument('--pad', default=0, type=int, required=False,
                        help='PAD value')
    parser.add_argument('--epochs', default=1, type=int, required=False,
                        help='only used for the XL model; set to 1 for GPT2')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    ch_sp = spm.SentencePieceProcessor()
    ch_sp.Load(args.spm_model_path)

    build_tfrecord(args.raw_data_path, args.save_tfrecord_path, ch_sp, args.min_length, args.n_ctx,
                   args.batch_size, pad=args.pad, epochs=args.epochs)
Example #11
Source File: tokenization.py From embedding-as-service with MIT License | 6 votes |
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    self.vocab = None
    self.sp_model = None
    if spm_model_file:
        self.sp_model = spm.SentencePieceProcessor()
        tf.logging.info("loading sentence piece model")
        self.sp_model.Load(spm_model_file)
        # Note(mingdachen): For the purpose of consisent API, we are
        # generating a vocabulary for the sentence piece tokenizer.
        self.vocab = {self.sp_model.IdToPiece(i): i
                      for i in range(self.sp_model.GetPieceSize())}
    else:
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
Example #12
Source File: translate_server.py From nlp-services with MIT License | 6 votes |
def translate_text(text, source, target):
    if source == target:
        # The easy case ;-)
        return text

    t = translations[source][target]
    s = spm.SentencePieceProcessor()
    s.Load(os.path.join(ROOT_DIR, 'models', t["sentencepiece_model"]))
    pieces = s.encode_as_pieces(text)

    # Ensure any trailing words without terminating punctuation is also translated.
    if pieces[-1] != '.':
        pieces.append('.')

    # For other languages we will need a better system for chunking sentences or parts of text.
    indices = [i for i, _x in enumerate(pieces) if _x in [".", "!", "?"]]

    complete_result = []
    start = 0
    for i in indices:
        x = " ".join([e for e in pieces[start:i + 1]])
        result = _translate(x, translate_model=t['translate_model'])
        y = s.decode_pieces(result[1][0].split(" "))
        complete_result.append(y)
        start = i

    return "\n".join(complete_result)
Example #13
Source File: tokenization_albert.py From Bert-Multi-Label-Text-Classification with MIT License | 6 votes |
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    self.vocab = None
    self.sp_model = None
    print(spm_model_file)
    if spm_model_file:
        self.sp_model = spm.SentencePieceProcessor()
        logger.info("loading sentence piece model")
        self.sp_model.Load(str(spm_model_file))
        # # Note(mingdachen): For the purpose of consisent API, we are
        # # generating a vocabulary for the sentence piece tokenizer.
        self.vocab = {self.sp_model.IdToPiece(i): i
                      for i in range(self.sp_model.GetPieceSize())}
    else:
        print("load vocab")
        self.vocab = load_vocab(vocab_file)
        print("load token")
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token="[UNK]",
                                                      max_input_chars_per_word=100)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
Example #14
Source File: test_functional.py From text with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_generate_sp_model(self):
    """
    Test the function to train a sentencepiece tokenizer.
    """
    asset_name = 'text_normalization_ag_news_test.csv'
    asset_path = get_asset_path(asset_name)
    # We use temporary directory for two reasons:
    # 1. buck (fb internal) generates test environment which contains ',' in its path.
    #    SentencePieceTrainer considers such path as comma-delimited file list.
    #    So as workaround we copy the asset data to temporary directory and load it from there.
    # 2. when fb infra performs stress tests, multiple instances of this test run.
    #    The name of the generated models have to be unique and they need to be cleaned up.
    with tempfile.TemporaryDirectory() as dir_name:
        data_path = os.path.join(dir_name, asset_name)
        shutil.copy(asset_path, data_path)

        model_prefix = os.path.join(dir_name, f'spm_user_{uuid.uuid4()}')
        model_file = f'{model_prefix}.model'
        generate_sp_model(data_path, vocab_size=23456, model_prefix=model_prefix)

        sp_user = spm.SentencePieceProcessor()
        sp_user.Load(model_file)

        self.assertEqual(len(sp_user), 23456)
Example #15
Source File: pre_process.py From gpt-2-tensorflow2.0 with MIT License | 6 votes |
def create_tf_records(min_seq_len, max_seq_len, per_file_limit=50000):
    print("Creating TF Records...............")
    s = spm.SentencePieceProcessor()
    s.Load(BPE_MODEL_PATH + ".model")
    if not os.path.exists(TF_RECORDS):
        os.makedirs(TF_RECORDS)
    filename = TF_RECORDS + str(datetime.datetime.now().timestamp()) + ".tfrecord"
    tf_writer = tf.io.TFRecordWriter(filename)
    doc_counts = 0
    with open(PROCESS_DATA_PATH, 'r') as f:
        for line in tqdm.tqdm(f):
            encoded_id = s.encode_as_ids(line)
            if max_seq_len > len(encoded_id) > min_seq_len:
                inputs = np.array([BOS_ID] + encoded_id)
                targets = np.array(encoded_id + [EOS_ID])

                example = serialize_example(inputs, targets)
                tf_writer.write(example)
                doc_counts += 1
            if doc_counts >= per_file_limit:
                tf_writer.write(example)
                doc_counts = 0
                tf_writer.close()
                filename = TF_RECORDS + str(datetime.datetime.now().timestamp()) + ".tfrecord"
                tf_writer = tf.io.TFRecordWriter(filename)
Example #16
Source File: albert_tokenization.py From bert-for-tf2 with MIT License | 6 votes |
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    self.vocab = None
    self.sp_model = None
    if spm_model_file:
        import sentencepiece as spm

        self.sp_model = spm.SentencePieceProcessor()
        tf.compat.v1.logging.info("loading sentence piece model")
        self.sp_model.Load(spm_model_file)
        # Note(mingdachen): For the purpose of consisent API, we are
        # generating a vocabulary for the sentence piece tokenizer.
        self.vocab = {self.sp_model.IdToPiece(i): i
                      for i in range(self.sp_model.GetPieceSize())}
    else:
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
Example #17
Source File: sample.py From gpt-2-tensorflow2.0 with MIT License | 6 votes |
def load_weights(self):
    with open(self.model_param) as f:
        param = json.load(f)
    self.model = Gpt2(param['num_layers'],
                      param['d_model'],
                      param['num_heads'],
                      param['dff'],
                      param['max_seq_len'],
                      param['vocab_size'])
    ckpt = tf.train.Checkpoint(model=self.model)
    ckpt_manager = tf.train.CheckpointManager(ckpt, self.model_path, max_to_keep=1)
    ckpt.restore(ckpt_manager.latest_checkpoint).expect_partial()
    print('Model weights loaded into memory')

    self.sp = spm.SentencePieceProcessor()
    self.sp.load(self.vocab_path)
Example #18
Source File: tokenization_albert.py From albert_pytorch with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    self.vocab = None
    self.sp_model = None
    if spm_model_file:
        self.sp_model = spm.SentencePieceProcessor()
        logger.info("loading sentence piece model")
        self.sp_model.Load(spm_model_file)
        # # Note(mingdachen): For the purpose of consisent API, we are
        # # generating a vocabulary for the sentence piece tokenizer.
        self.vocab = {self.sp_model.IdToPiece(i): i
                      for i in range(self.sp_model.GetPieceSize())}
    else:
        print("load vocab")
        self.vocab = load_vocab(vocab_file)
        print("load token")
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token="[UNK]",
                                                      max_input_chars_per_word=100)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
Example #19
Source File: bpevocabulary.py From dpu-utils with MIT License | 6 votes |
def __init__(self, max_size: int, sentencepiece_model_filepath: Optional[str]=None,
             bos_token: str="<s>", eos_token: str="</s>", unk_token: str="<unk>", pad_token: str="<pad>",
             user_defined_symbols: Optional[List[str]] = None,
             control_symbols: Optional[List[str]]=None) -> None:
    self.__max_size = max_size
    self.__bos_token = bos_token
    self.__eos_token = eos_token
    self.__unk_token = unk_token
    self.__pad_token = pad_token

    self.vocab_file = sentencepiece_model_filepath

    if user_defined_symbols is None:
        user_defined_symbols = []
    self.user_defined_symbols = ",".join(user_defined_symbols)

    if control_symbols is None:
        control_symbols = self.DEFAULT_CONTROL_SYMBOLS
    self.control_symbols = ",".join(control_symbols)

    self.__sp_model = spm.SentencePieceProcessor()
    if sentencepiece_model_filepath is not None:
        self.__load_model_from_filepath(sentencepiece_model_filepath)

# region Custom Pickling
Example #20
Source File: tokenization_xlnet.py From CCF-BDCI-Sentiment-Analysis-Baseline with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, max_len=None,
             do_lower_case=False, remove_space=True, keep_accents=False,
             bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
             pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
             additional_special_tokens=["<eop>", "<eod>"], **kwargs):
    super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                         unk_token=unk_token, sep_token=sep_token,
                                         pad_token=pad_token, cls_token=cls_token,
                                         mask_token=mask_token,
                                         additional_special_tokens=additional_special_tokens,
                                         **kwargs)
    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                       "pip install sentencepiece")

    self.do_lower_case = do_lower_case
    self.remove_space = remove_space
    self.keep_accents = keep_accents
    self.vocab_file = vocab_file

    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(vocab_file)
Example #21
Source File: sentencepiece_tokenizer.py From espnet with Apache License 2.0 | 5 votes |
def __init__(self, model: Union[Path, str]):
    assert check_argument_types()
    self.model = str(model)
    # NOTE(kamo):
    # Don't build SentencePieceProcessor in __init__()
    # because it's not picklable and it may cause following error,
    # "TypeError: can't pickle SwigPyObject objects",
    # when giving it as argument of "multiprocessing.Process()".
    self.sp = None
Example #22
Source File: generate_gpt2_keras.py From Decoders-Chinese-TF2.0 with MIT License | 5 votes |
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0', type=str, required=False, help='device used for generation')
    parser.add_argument('--length', default=50, type=int, required=False, help='generation length')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='generation temperature')
    parser.add_argument('--topk', default=5, type=int, required=False, help='sample from the top k candidates')
    parser.add_argument('--topp', default=0.95, type=float, required=False, help='top cumulative probability')
    parser.add_argument('--model_config', default='configs/gpt2/model_config_small.json', type=str, required=False,
                        help='model configuration')
    parser.add_argument('--spm_model_path', default='spm_model/ch.model', type=str, required=False, help='')
    parser.add_argument('--model_path', default='model/', type=str, required=False, help='model path')
    parser.add_argument('--prefix', default='丨', type=str, required=False, help='prefix to start the generated text with')
    parser.add_argument('--repetition_penalty', default=1.1, type=float, required=False)
    parser.add_argument('--n_ctx', default=512, type=int)

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    model = modeling_gpt2.TFGPT2LMHeadModel.from_pretrained(args.model_path)

    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty
    n_ctx = args.n_ctx

    ch_sp = spm.SentencePieceProcessor()
    ch_sp.Load(args.spm_model_path)

    while True:
        context = ch_sp.encode_as_ids(args.prefix)
        generated = sample_sequence(model, context, length, n_ctx, ch_sp, temperature=temperature,
                                    top_k=topk, top_p=topp, repitition_penalty=repetition_penalty)
        print(ch_sp.decode_ids(generated))
Example #23
Source File: tokenization_xlm_roberta.py From exbert with Apache License 2.0 | 5 votes |
def __setstate__(self, d):
    self.__dict__ = d
    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning(
            "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
            "pip install sentencepiece"
        )
        raise
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(self.vocab_file)
Example #24
Source File: bpe_tokenizer.py From GPT2-Chinese with MIT License | 5 votes |
def __init__(self, model_path):
    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(model_path)
Example #25
Source File: wordpiece.py From neural_sp with Apache License 2.0 | 5 votes |
def __init__(self, dict_path, wp_model):
    # Load a dictionary file
    self.idx2token = {0: '<blank>'}
    with codecs.open(dict_path, 'r', 'utf-8') as f:
        for line in f:
            wp, idx = line.strip().split(' ')
            self.idx2token[int(idx)] = wp
    self.vocab = len(self.idx2token.keys())

    # for synchronous bidirectional attention
    self.idx2token[self.vocab] = '<l2r>'
    self.idx2token[self.vocab + 1] = '<r2l>'
    self.idx2token[self.vocab + 2] = '<null>'

    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(wp_model)
Example #26
Source File: wordpiece.py From neural_sp with Apache License 2.0 | 5 votes |
def __init__(self, dict_path, wp_model):
    # Load a dictionary file
    self.token2idx = {'<blank>': 0}
    with codecs.open(dict_path, 'r', 'utf-8') as f:
        for line in f:
            wp, idx = line.strip().split(' ')
            self.token2idx[wp] = int(idx)
    self.vocab = len(self.token2idx.keys())

    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(wp_model)
Example #27
Source File: run_squad.py From embedding-as-service with MIT License | 5 votes |
def preprocess():
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load(FLAGS.spiece_model_file)
    spm_basename = _get_spm_basename()

    train_rec_file = os.path.join(
        FLAGS.output_dir,
        "{}.{}.slen-{}.qlen-{}.train.tf_record".format(
            spm_basename, FLAGS.proc_id, FLAGS.max_seq_length,
            FLAGS.max_query_length))

    tf.logging.info("Read examples from {}".format(FLAGS.train_file))
    train_examples = read_squad_examples(FLAGS.train_file, is_training=True)
    train_examples = train_examples[FLAGS.proc_id::FLAGS.num_proc]

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    random.shuffle(train_examples)

    tf.logging.info("Write to {}".format(train_rec_file))
    train_writer = FeatureWriter(
        filename=train_rec_file,
        is_training=True)
    convert_examples_to_features(
        examples=train_examples,
        sp_model=sp_model,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature)
    train_writer.close()
Example #28
Source File: __init__.py From embedding-as-service with MIT License | 5 votes |
def load_tokenizer(model_path: str):
    """Get the vocab file and casing info from the Hub module."""
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load(os.path.join(model_path, Embeddings.sentence_piece_model_path))
    Embeddings.tokenizer = sp_model
Example #29
Source File: __init__.py From embedding-as-service with MIT License | 5 votes |
def __init__(self):
    self.sess = tf.Session()
    self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    self.use_outputs = None
    self.model_name = None
    self.max_seq_length = None

    # placeholder for dan and large model
    self.sentences = None

    # sentencepiece and place holder model for lite version
    self.sp_model = spm.SentencePieceProcessor()
    self.input_placeholder = None
Example #30
Source File: tokenization_xlnet.py From CCF-BDCI-Sentiment-Analysis-Baseline with Apache License 2.0 | 5 votes |
def __setstate__(self, d):
    self.__dict__ = d
    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                       "pip install sentencepiece")
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(self.vocab_file)