Python sentencepiece.SentencePieceProcessor() Examples

The following are 30 code examples of sentencepiece.SentencePieceProcessor(), collected from open-source projects. The original project and source file are noted above each example. You may also want to check out all available functions and classes of the sentencepiece module.
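Before the project-specific examples, here is a minimal, self-contained sketch of the typical load/encode/decode round trip with SentencePieceProcessor. It assumes a trained model file (called spm.model here purely for illustration) already exists on disk.

import sentencepiece as spm

# Load a trained SentencePiece model (assumed to have been produced earlier
# by spm.SentencePieceTrainer.Train or the spm_train command-line tool).
sp = spm.SentencePieceProcessor()
sp.Load("spm.model")

# Encode text into subword pieces and into their integer ids.
pieces = sp.EncodeAsPieces("Hello world")
ids = sp.EncodeAsIds("Hello world")

# Decode the ids back into a plain string.
text = sp.DecodeIds(ids)
print(pieces, ids, text)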
Example #1
Source File: vocab.py    From BERT-keras with GNU General Public License v3.0
def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
                 vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
        super().__init__(vocab_size)
        if not os.path.exists('{}.model'.format(model_name)):
            if spm_model_type.lower() not in ('unigram', 'bpe', 'char', 'word'):
                raise ValueError(
                    '{} is not a valid model_type for sentence piece, '
                    'valid options are: unigram, bpe, char, word'.format(spm_model_type))
            spm.SentencePieceTrainer.Train(
                '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
                '--character_coverage={coverage} --model_type={model_type} '
                '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '.format(
                    input=text_corpus_address, model_name=model_name, vocab_size=vocab_size, coverage=1,
                    model_type=spm_model_type.lower()))
        self.sp = spm.SentencePieceProcessor()
        self.sp.load('{}.model'.format(model_name)) 
Example #2
Source File: sp_encoder.py    From ru_transformers with Apache License 2.0
def __init__(self, filename, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)
        self.max_len_single_sentence = 1024 # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = 1024 # no default special tokens - you can update this value if you add special tokens

        if os.path.isdir(filename): filename = os.path.join(filename, self.def_name)

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(filename)
        self.hash = hashlib.sha512(open(filename, 'rb').read()).hexdigest()[:10]
        self.filename = filename
        # for some reason SentencePiece inserts a blank line id before special token if that is the only 
        # token in the line. I'd like to remove that blank line id from encoding.
        nl_ids = self.sp.EncodeAsIds(NEW_LINE)
        assert(len(nl_ids) == 2)
        self.blank_line_id = nl_ids[0] 
Example #3
Source File: tokenizers.py    From virtex with MIT License
def __init__(self, vocab_path: str, model_path: str):
        self.vocab_path = vocab_path
        self.model_path = model_path

        # Load pretrained tokenizer model.
        self.model = sp.SentencePieceProcessor()
        self.model.Load(model_path)

        # Load vocabulary mapping (and inverse mapping) between token and id.
        self._token_to_id: Dict[str, int] = {}
        self._id_to_token: Dict[int, str] = {}

        with open(vocab_path, "r") as vocab_file:
            reader = csv.DictReader(
                vocab_file, delimiter="\t", fieldnames=["token", "logprob"]
            )
            for index, row in enumerate(reader):
                self._token_to_id[row["token"]] = index
                self._id_to_token[index] = row["token"] 
Example #4
Source File: tokenization_xlnet.py    From TextClassify with Apache License 2.0
def __init__(self, vocab_file, max_len=None,
                 do_lower_case=False, remove_space=True, keep_accents=False,
                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
                 additional_special_tokens=["<eop>", "<eod>"], **kwargs):
        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                             unk_token=unk_token, sep_token=sep_token,
                                             pad_token=pad_token, cls_token=cls_token,
                                             mask_token=mask_token, additional_special_tokens=
                                             additional_special_tokens, **kwargs)
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file) 
Example #5
Source File: gdqn.py    From KG-A2C with MIT License
def __init__(self, params):
        configure_logger(params['output_dir'])
        log('Parameters {}'.format(params))
        self.params = params
        self.binding = load_bindings(params['rom_file_path'])
        self.max_word_length = self.binding['max_word_length']
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(params['spm_file'])
        kg_env = KGA2CEnv(params['rom_file_path'], params['seed'], self.sp,
                          params['tsv_file'], step_limit=params['reset_steps'],
                          stuck_steps=params['stuck_steps'], gat=params['gat'])
        self.vec_env = VecEnv(params['batch_size'], kg_env, params['openie_path'])
        self.template_generator = TemplateActionGenerator(self.binding)
        env = FrotzEnv(params['rom_file_path'])
        self.vocab_act, self.vocab_act_rev = load_vocab(env)
        self.model = KGA2C(params, self.template_generator.templates, self.max_word_length,
                           self.vocab_act, self.vocab_act_rev, len(self.sp), gat=self.params['gat']).cuda()
        self.batch_size = params['batch_size']
        if params['preload_weights']:
            self.model = torch.load(self.params['preload_weights'])['model']
        self.optimizer = optim.Adam(self.model.parameters(), lr=params['lr'])

        self.loss_fn1 = nn.BCELoss()
        self.loss_fn2 = nn.BCEWithLogitsLoss()
        self.loss_fn3 = nn.MSELoss() 
Example #6
Source File: tokenization.py    From albert with Apache License 2.0
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    self.vocab = None
    self.sp_model = None
    if spm_model_file:
      self.sp_model = spm.SentencePieceProcessor()
      tf.logging.info("loading sentence piece model")
      # Handle cases where SP can't load the file, but gfile can.
      sp_model_ = tf.gfile.GFile(spm_model_file, "rb").read()
      self.sp_model.LoadFromSerializedProto(sp_model_)
      # Note(mingdachen): For the purpose of a consistent API, we are
      # generating a vocabulary for the sentence piece tokenizer.
      self.vocab = {self.sp_model.IdToPiece(i): i for i
                    in range(self.sp_model.GetPieceSize())}
    else:
      self.vocab = load_vocab(vocab_file)
      self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
      self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    self.inv_vocab = {v: k for k, v in self.vocab.items()} 
Example #7
Source File: vocab.py    From BERT with Apache License 2.0
def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
                 vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
        super().__init__(vocab_size)
        if not os.path.exists('{}.model'.format(model_name)):
            if spm_model_type.lower() not in ('unigram', 'bpe', 'char', 'word'):
                raise ValueError(
                    '{} is not a valid model_type for sentence piece, '
                    'valid options are: unigram, bpe, char, word'.format(spm_model_type))
            spm.SentencePieceTrainer.Train(
                '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
                '--character_coverage={coverage} --model_type={model_type} '
                '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '
                '--training_sentence_size=100000000'.format(
                    input=text_corpus_address, model_name=model_name, vocab_size=vocab_size, coverage=1,
                    model_type=spm_model_type.lower()))
        self.sp = spm.SentencePieceProcessor()
        self.sp.load('{}.model'.format(model_name)) 
Example #8
Source File: tokenizers.py    From bert4keras with Apache License 2.0
def __init__(self, sp_model_path, *args, **kwargs):
        super(SpTokenizer, self).__init__(*args, **kwargs)
        import sentencepiece as spm
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(sp_model_path)
        self._token_pad = self.sp_model.id_to_piece(self.sp_model.pad_id())
        self._token_unk = self.sp_model.id_to_piece(self.sp_model.unk_id())
        self._vocab_size = self.sp_model.get_piece_size()

        for token in ['pad', 'unk', 'mask', 'start', 'end']:
            try:
                _token = getattr(self, '_token_%s' % token)
                _token_id = self.sp_model.piece_to_id(_token)
                setattr(self, '_token_%s_id' % token, _token_id)
            except:
                pass 
Example #9
Source File: test_sentencepiece_tokenizer.py    From espnet with Apache License 2.0
def spm_srcs(tmp_path: Path):
    input_text = tmp_path / "text"
    vocabsize = len(string.ascii_letters) + 4
    model_prefix = tmp_path / "model"
    model = str(model_prefix) + ".model"
    input_sentence_size = 100000

    with input_text.open("w") as f:
        f.write(string.ascii_letters + "\n")

    spm.SentencePieceTrainer.Train(
        f"--input={input_text} "
        f"--vocab_size={vocabsize} "
        f"--model_prefix={model_prefix} "
        f"--input_sentence_size={input_sentence_size}"
    )
    sp = spm.SentencePieceProcessor()
    sp.load(model)

    with input_text.open("r") as f:
        vocabs = {"<unk>", "▁"}
        for line in f:
            tokens = sp.DecodePieces(list(line.strip()))
        vocabs |= set(tokens)
    return model, vocabs 
Example #10
Source File: prepare_data.py    From Decoders-Chinese-TF2.0 with MIT License
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--spm_model_path', default='spm_model/ch.model', type=str, required=False, help='path to the sentencepiece model')
    parser.add_argument('--raw_data_path', default='data/train_test.txt', type=str, required=False, help='path to the raw corpus')
    parser.add_argument('--save_tfrecord_path', default='data/tokenized/', type=str, required=False, help='where to store the processed corpus')
    parser.add_argument('--min_length', default=10, type=int, required=False, help='minimum sentence length to keep')
    parser.add_argument('--n_ctx', default=512, type=int, required=False, help='length of each training sample')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='only used by the XL model (its batch size); set to 1 for GPT2')
    parser.add_argument('--pad', default=0, type=int, required=False, help='PAD value')
    parser.add_argument('--epochs', default=1, type=int, required=False, help='only used by the XL model; set to 1 for GPT2')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    ch_sp = spm.SentencePieceProcessor()
    ch_sp.Load(args.spm_model_path)

    build_tfrecord(args.raw_data_path, args.save_tfrecord_path, ch_sp, args.min_length, args.n_ctx,
                   args.batch_size, pad=args.pad, epochs=args.epochs) 
Example #11
Source File: tokenization.py    From embedding-as-service with MIT License
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
        self.vocab = None
        self.sp_model = None
        if spm_model_file:
            self.sp_model = spm.SentencePieceProcessor()
            tf.logging.info("loading sentence piece model")
            self.sp_model.Load(spm_model_file)
            # Note(mingdachen): For the purpose of a consistent API, we are
            # generating a vocabulary for the sentence piece tokenizer.
            self.vocab = {self.sp_model.IdToPiece(i): i for i
                          in range(self.sp_model.GetPieceSize())}
        else:
            self.vocab = load_vocab(vocab_file)
            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
            self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.inv_vocab = {v: k for k, v in self.vocab.items()} 
Example #12
Source File: translate_server.py    From nlp-services with MIT License
def translate_text(text, source, target):
    if source == target:
        # The easy case ;-)
        return text

    t = translations[source][target]
    s = spm.SentencePieceProcessor()
    s.Load(os.path.join(ROOT_DIR, 'models', t["sentencepiece_model"]))
    pieces = s.encode_as_pieces(text)

    # Ensure any trailing words without terminating punctuation is also translated.
    if pieces[-1] != '.':
        pieces.append('.')
    # For other languages we will need a better system for chunking sentences or parts of text.
    indices = [i for i, _x in enumerate(pieces) if _x in [".", "!", "?"]]
    
    complete_result = []
    start=0
    for i in indices:
        x = " ".join([e for e in pieces[start:i+1]])
        result = _translate(x, translate_model=t['translate_model'])
        y = s.decode_pieces(result[1][0].split(" "))
        complete_result.append(y)
        start = i
    return "\n".join(complete_result) 
Example #13
Source File: tokenization_albert.py    From Bert-Multi-Label-Text-Classification with MIT License
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    self.vocab = None
    self.sp_model = None
    print(spm_model_file)
    if spm_model_file:
      self.sp_model = spm.SentencePieceProcessor()
      logger.info("loading sentence piece model")
      self.sp_model.Load(str(spm_model_file))
      # # Note(mingdachen): For the purpose of a consistent API, we are
      # # generating a vocabulary for the sentence piece tokenizer.
      self.vocab = {self.sp_model.IdToPiece(i): i for i
                    in range(self.sp_model.GetPieceSize())}
    else:
      print("load vocab")
      self.vocab = load_vocab(vocab_file)
      print("load token")
      self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
      self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,unk_token="[UNK]", max_input_chars_per_word=100)
    self.inv_vocab = {v: k for k, v in self.vocab.items()} 
Example #14
Source File: test_functional.py    From text with BSD 3-Clause "New" or "Revised" License
def test_generate_sp_model(self):
        """
        Test the function to train a sentencepiece tokenizer.
        """

        asset_name = 'text_normalization_ag_news_test.csv'
        asset_path = get_asset_path(asset_name)
        # We use temporary directory for two reasons:
        # 1. buck (fb internal) generates test environment which contains ',' in its path.
        #    SentencePieceTrainer considers such path as comma-delimited file list.
        #    So as workaround we copy the asset data to temporary directory and load it from there.
        # 2. when fb infra performs stress tests, multiple instances of this test run.
        #    The name of the generated models have to be unique and they need to be cleaned up.
        with tempfile.TemporaryDirectory() as dir_name:
            data_path = os.path.join(dir_name, asset_name)
            shutil.copy(asset_path, data_path)

            model_prefix = os.path.join(dir_name, f'spm_user_{uuid.uuid4()}')
            model_file = f'{model_prefix}.model'
            generate_sp_model(data_path, vocab_size=23456, model_prefix=model_prefix)

            sp_user = spm.SentencePieceProcessor()
            sp_user.Load(model_file)

            self.assertEqual(len(sp_user), 23456) 
Example #15
Source File: pre_process.py    From gpt-2-tensorflow2.0 with MIT License
def create_tf_records(min_seq_len, max_seq_len, per_file_limit=50000):
	print("Creating TF Records...............")
	s = spm.SentencePieceProcessor()
	s.Load(BPE_MODEL_PATH + ".model")
	if not os.path.exists(TF_RECORDS):
		os.makedirs(TF_RECORDS)
	filename = TF_RECORDS + str(datetime.datetime.now().timestamp()) + ".tfrecord"
	tf_writer = tf.io.TFRecordWriter(filename)
	doc_counts = 0
	with open(PROCESS_DATA_PATH, 'r') as f:
		for line in tqdm.tqdm(f):
			encoded_id = s.encode_as_ids(line)
			if max_seq_len > len(encoded_id) > min_seq_len:
				inputs = np.array([BOS_ID] + encoded_id)
				targets = np.array(encoded_id + [EOS_ID])

				example = serialize_example(inputs, targets)
				tf_writer.write(example)
				doc_counts += 1
			if doc_counts >= per_file_limit:
				doc_counts = 0
				tf_writer.close()
				filename = TF_RECORDS + str(datetime.datetime.now().timestamp()) + ".tfrecord"
				tf_writer = tf.io.TFRecordWriter(filename) 
Example #16
Source File: albert_tokenization.py    From bert-for-tf2 with MIT License
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
        self.vocab = None
        self.sp_model = None
        if spm_model_file:
            import sentencepiece as spm

            self.sp_model = spm.SentencePieceProcessor()
            tf.compat.v1.logging.info("loading sentence piece model")
            self.sp_model.Load(spm_model_file)
            # Note(mingdachen): For the purpose of a consistent API, we are
            # generating a vocabulary for the sentence piece tokenizer.
            self.vocab = {self.sp_model.IdToPiece(i): i for i
                          in range(self.sp_model.GetPieceSize())}
        else:
            self.vocab = load_vocab(vocab_file)
            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
            self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.inv_vocab = {v: k for k, v in self.vocab.items()} 
Example #17
Source File: sample.py    From gpt-2-tensorflow2.0 with MIT License
def load_weights(self):
		with open(self.model_param) as f:
			param = json.load(f)
		self.model = Gpt2(param['num_layers'],
						  param['d_model'],
						  param['num_heads'],
						  param['dff'],
						  param['max_seq_len'],
						  param['vocab_size'])

		ckpt = tf.train.Checkpoint(model=self.model)

		ckpt_manager = tf.train.CheckpointManager(ckpt, self.model_path, max_to_keep=1)

		ckpt.restore(ckpt_manager.latest_checkpoint).expect_partial()
		print('Model weights loaded into memory')

		self.sp = spm.SentencePieceProcessor()
		self.sp.load(self.vocab_path) 
Example #18
Source File: tokenization_albert.py    From albert_pytorch with Apache License 2.0
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    self.vocab = None
    self.sp_model = None
    if spm_model_file:
      self.sp_model = spm.SentencePieceProcessor()
      logger.info("loading sentence piece model")
      self.sp_model.Load(spm_model_file)
      # # Note(mingdachen): For the purpose of a consistent API, we are
      # # generating a vocabulary for the sentence piece tokenizer.
      self.vocab = {self.sp_model.IdToPiece(i): i for i
                    in range(self.sp_model.GetPieceSize())}
    else:
      print("load vocab")
      self.vocab = load_vocab(vocab_file)
      print("load token")
      self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
      self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,unk_token="[UNK]", max_input_chars_per_word=100)
    self.inv_vocab = {v: k for k, v in self.vocab.items()} 
Example #19
Source File: bpevocabulary.py    From dpu-utils with MIT License
def __init__(self, max_size: int, sentencepiece_model_filepath: Optional[str]=None,
                 bos_token: str="<s>", eos_token: str="</s>", unk_token: str="<unk>", pad_token: str="<pad>",
                 user_defined_symbols: Optional[List[str]] = None,
                 control_symbols: Optional[List[str]]=None) -> None:

        self.__max_size=max_size
        self.__bos_token=bos_token
        self.__eos_token=eos_token
        self.__unk_token=unk_token
        self.__pad_token=pad_token

        self.vocab_file = sentencepiece_model_filepath
        if user_defined_symbols is None:
            user_defined_symbols = []
        self.user_defined_symbols=",".join(user_defined_symbols)

        if control_symbols is None:
            control_symbols = self.DEFAULT_CONTROL_SYMBOLS
        self.control_symbols=",".join(control_symbols)

        self.__sp_model = spm.SentencePieceProcessor()
        if sentencepiece_model_filepath is not None:
            self.__load_model_from_filepath(sentencepiece_model_filepath)

    #region Custom Pickling 
Example #20
Source File: tokenization_xlnet.py    From CCF-BDCI-Sentiment-Analysis-Baseline with Apache License 2.0
def __init__(self, vocab_file, max_len=None,
                 do_lower_case=False, remove_space=True, keep_accents=False,
                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
                 additional_special_tokens=["<eop>", "<eod>"], **kwargs):
        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                             unk_token=unk_token, sep_token=sep_token,
                                             pad_token=pad_token, cls_token=cls_token,
                                             mask_token=mask_token, additional_special_tokens=
                                             additional_special_tokens, **kwargs)
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file) 
Example #21
Source File: sentencepiece_tokenizer.py    From espnet with Apache License 2.0
def __init__(self, model: Union[Path, str]):
        assert check_argument_types()
        self.model = str(model)
        # NOTE(kamo):
        # Don't build SentencePieceProcessor in __init__()
        # because it's not picklable and it may cause following error,
        # "TypeError: can't pickle SwigPyObject objects",
        # when giving it as argument of "multiprocessing.Process()".
        self.sp = None 
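The note in the example above describes a real constraint: SentencePieceProcessor wraps a SWIG object and cannot be pickled, so an object handed to multiprocessing.Process() should carry only the model path and build the processor lazily. Below is a minimal sketch of that pattern; the class name and method are illustrative and not part of espnet.

import sentencepiece as spm

class LazySentencePieceTokenizer:
    """Stores only the (picklable) model path and builds the processor on first use."""

    def __init__(self, model_path: str):
        self.model_path = model_path
        self.sp = None  # created lazily so the object stays picklable

    def tokenize(self, text: str):
        if self.sp is None:
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(self.model_path)
        return self.sp.EncodeAsPieces(text)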
Example #22
Source File: generate_gpt2_keras.py    From Decoders-Chinese-TF2.0 with MIT License
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--device', default='0', type=str, required=False, help='device used for generation')
    parser.add_argument('--length', default=50, type=int, required=False, help='length of the generated text')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='generation temperature')
    parser.add_argument('--topk', default=5, type=int, required=False, help='top-k: sample from the k most likely tokens')
    parser.add_argument('--topp', default=0.95, type=float, required=False, help='top-p: cumulative probability threshold')
    parser.add_argument('--model_config', default='configs/gpt2/model_config_small.json', type=str, required=False,
                        help='model config parameters')
    parser.add_argument('--spm_model_path', default='spm_model/ch.model', type=str, required=False, help='')
    parser.add_argument('--model_path', default='model/', type=str, required=False, help='model path')
    parser.add_argument('--prefix', default='丨', type=str, required=False, help='prefix that starts the generated text')
    parser.add_argument('--repetition_penalty', default=1.1, type=float, required=False)
    parser.add_argument('--n_ctx', default=512, type=int)
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    model = modeling_gpt2.TFGPT2LMHeadModel.from_pretrained(args.model_path)

    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty
    n_ctx = args.n_ctx

    ch_sp = spm.SentencePieceProcessor()
    ch_sp.Load(args.spm_model_path)

    while True:
        context = ch_sp.encode_as_ids(args.prefix)
        generated = sample_sequence(model, context, length, n_ctx, ch_sp, temperature=temperature, top_k=topk, top_p=topp,
                        repitition_penalty=repetition_penalty)
        print(ch_sp.decode_ids(generated)) 
Example #23
Source File: tokenization_xlm_roberta.py    From exbert with Apache License 2.0
def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
                "pip install sentencepiece"
            )
            raise
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file) 
Example #24
Source File: bpe_tokenizer.py    From GPT2-Chinese with MIT License
def __init__(self, model_path):
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path) 
Example #25
Source File: wordpiece.py    From neural_sp with Apache License 2.0
def __init__(self, dict_path, wp_model):
        # Load a dictionary file
        self.idx2token = {0: '<blank>'}
        with codecs.open(dict_path, 'r', 'utf-8') as f:
            for line in f:
                wp, idx = line.strip().split(' ')
                self.idx2token[int(idx)] = wp
        self.vocab = len(self.idx2token.keys())
        # for synchronous bidirectional attention
        self.idx2token[self.vocab] = '<l2r>'
        self.idx2token[self.vocab + 1] = '<r2l>'
        self.idx2token[self.vocab + 2] = '<null>'

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(wp_model) 
Example #26
Source File: wordpiece.py    From neural_sp with Apache License 2.0
def __init__(self, dict_path, wp_model):
        # Load a dictionary file
        self.token2idx = {'<blank>': 0}
        with codecs.open(dict_path, 'r', 'utf-8') as f:
            for line in f:
                wp, idx = line.strip().split(' ')
                self.token2idx[wp] = int(idx)
        self.vocab = len(self.token2idx.keys())

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(wp_model) 
Example #27
Source File: run_squad.py    From embedding-as-service with MIT License
def preprocess():
  sp_model = spm.SentencePieceProcessor()
  sp_model.Load(FLAGS.spiece_model_file)
  spm_basename = _get_spm_basename()

  train_rec_file = os.path.join(
      FLAGS.output_dir,
      "{}.{}.slen-{}.qlen-{}.train.tf_record".format(
          spm_basename, FLAGS.proc_id, FLAGS.max_seq_length,
          FLAGS.max_query_length))

  tf.logging.info("Read examples from {}".format(FLAGS.train_file))
  train_examples = read_squad_examples(FLAGS.train_file, is_training=True)
  train_examples = train_examples[FLAGS.proc_id::FLAGS.num_proc]

  # Pre-shuffle the input to avoid having to make a very large shuffle
  # buffer in the `input_fn`.
  random.shuffle(train_examples)

  tf.logging.info("Write to {}".format(train_rec_file))
  train_writer = FeatureWriter(
      filename=train_rec_file,
      is_training=True)
  convert_examples_to_features(
      examples=train_examples,
      sp_model=sp_model,
      max_seq_length=FLAGS.max_seq_length,
      doc_stride=FLAGS.doc_stride,
      max_query_length=FLAGS.max_query_length,
      is_training=True,
      output_fn=train_writer.process_feature)
  train_writer.close() 
Example #28
Source File: __init__.py    From embedding-as-service with MIT License
def load_tokenizer(model_path: str):
        """Get the vocab file and casing info from the Hub module."""
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(os.path.join(model_path, Embeddings.sentence_piece_model_path))
        Embeddings.tokenizer = sp_model 
Example #29
Source File: __init__.py    From embedding-as-service with MIT License
def __init__(self):
        self.sess = tf.Session()
        self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        self.use_outputs = None
        self.model_name = None
        self.max_seq_length = None

        # placeholder for dan and large model
        self.sentences = None

        # sentencepiece and place holder model for lite version
        self.sp_model = spm.SentencePieceProcessor()
        self.input_placeholder = None 
Example #30
Source File: tokenization_xlnet.py    From CCF-BDCI-Sentiment-Analysis-Baseline with Apache License 2.0
def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)